
Merge branch 'main' into rlm/update-readme-langchain-agents

Jeff Tang 11 months ago
parent commit d183a1a83a

+ 7 - 7
docs/multi_gpu.md

@@ -24,7 +24,7 @@ This runs with the `samsum_dataset` for summarization application by default.
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -43,7 +43,7 @@ We use `torchrun` here to spawn multiple processes for FSDP.
 Setting `use_fast_kernels` will enable the use of Flash Attention or xFormers memory-efficient kernels based on the hardware being used. This speeds up the fine-tuning job. It has been enabled in HuggingFace's `optimum` library as a one-liner API; please read more [here](https://pytorch.org/blog/out-of-the-box-acceleration/).
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
 ```
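As a point of reference, the `optimum` one-liner mentioned above looks roughly like the sketch below. This is illustrative only: the model path is a placeholder, and the fine-tuning script is assumed to apply an equivalent transform internally when `--use_fast_kernels` is passed.

```python
# Illustrative sketch of the HuggingFace `optimum` one-liner referenced above.
# The model path is a placeholder; the fine-tuning script is assumed to apply
# an equivalent transform when --use_fast_kernels is set.
from transformers import AutoModelForCausalLM
from optimum.bettertransformer import BetterTransformer

model = AutoModelForCausalLM.from_pretrained("/path_of_model_folder/8B")
# Swaps attention modules for Flash Attention / xFormers memory-efficient kernels
model = BetterTransformer.transform(model)
```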
 
 ### Fine-tuning using FSDP Only
@@ -52,7 +52,7 @@ If interested in running full parameter finetuning without making use of PEFT me
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 8  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 8  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
 
 ```
 
@@ -62,7 +62,7 @@ If you are interested in running full parameter fine-tuning on the 70B model, yo
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 8 examples/finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
+torchrun --nnodes 1 --nproc_per_node 8 examples/finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
 
 ```
 
@@ -95,16 +95,16 @@ To run with each of the datasets set the `dataset` flag in the command as shown
 
 ```bash
 # grammer_dataset
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 ```
 

+ 4 - 4
docs/single_gpu.md

@@ -20,7 +20,7 @@ Get access to a machine with one GPU or if using a multi-GPU machine please make
 
 ```bash
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization --use_fp16 --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization --use_fp16 --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
 The args used in the command above are:
@@ -51,16 +51,16 @@ to run with each of the datasets set the `dataset` flag in the command as shown
 ```bash
 # grammer_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
 

File diff suppressed because it is too large
+ 7 - 17
recipes/README.md


+ 2 - 4
recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py

@@ -40,8 +40,6 @@ MODEL_HEADERS = params["MODEL_HEADERS"]
 SAFE_CHECK = params["SAFE_CHECK"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"] 
-# Default Llama tokenizer, replace with your own tokenizer 
-TOKENIZER_PATH = params["TOKENIZER_PATH"] 
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
 # Add your model endpoints here and specify the port number. You can acquire the endpoint when creating an on-prem server like vLLM.
@@ -55,8 +53,8 @@ else:
     print("No available GPUs")
 
 
-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from HuggingFace based on the model path you set. Note that Llama 3 uses a different tokenizer than Llama 2
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
 
 num_token_input_prompt = len(tokenizer.encode(PROMPT))
 print(f"Number of token for input prompt: {num_token_input_prompt}")

+ 0 - 1
recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json

@@ -5,7 +5,6 @@
     "MODEL_HEADERS" : {"Content-Type": "application/json"},
     "SAFE_CHECK" : true,
     "THRESHOLD_TPS" : 7,
-    "TOKENIZER_PATH" : "../../tokenizer",
     "RANDOM_PROMPT_LENGTH" : 1000,
     "TEMPERATURE" : 0.6,
     "TOP_P" : 0.9,

+ 2 - 4
recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py

@@ -36,8 +36,6 @@ MODEL_HEADERS = params["MODEL_HEADERS"]
 SAFE_CHECK = params["SAFE_CHECK"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"] 
-# Replace with your own tokenizer 
-TOKENIZER_PATH = params["TOKENIZER_PATH"] 
 RANDOM_PROMPT_LENGTH = params["RANDOM_PROMPT_LENGTH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
@@ -52,8 +50,8 @@ else:
     print("No available GPUs")
 
 
-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from HuggingFace based on the model path you set. Note that Llama 3 uses a different tokenizer than Llama 2
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
 
 # Select vocabulary tokens that are longer than 2 characters (closer to real words) and ASCII-only, i.e. close to English (not foolproof)
 vocab = [token for token in tokenizer.get_vocab().keys() if len(token) > 2 and all(ord(c) < 128 for c in token)]
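A plausible sketch of how this filtered vocabulary and the `RANDOM_PROMPT_LENGTH` parameter (see `parameters.json` above) could be combined to build a random prompt; the helper name is illustrative and the actual script may assemble the prompt differently.

```python
# Hedged sketch: build a random prompt of RANDOM_PROMPT_LENGTH tokens from the
# filtered vocabulary defined above. The helper name is illustrative only.
import random

def build_random_prompt(vocab: list, random_prompt_length: int) -> str:
    return " ".join(random.choices(vocab, k=random_prompt_length))
```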

+ 1 - 1
recipes/finetuning/README.md

@@ -99,7 +99,7 @@ It lets us specify the training settings for everything from `model_name` to `da
 You can enable [W&B](https://wandb.ai/) experiment tracking by using `use_wandb` flag as below. You can change the project name, entity and other `wandb.init` arguments in `wandb_config`.
 
 ```bash
-python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model --use_wandb
+python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model --use_wandb
 ```
 You'll be able to access a dedicated project or run link on [wandb.ai](https://wandb.ai) and see your dashboard like the one below.
 <div style="display: flex;">

+ 7 - 7
recipes/finetuning/multigpu_finetuning.md

@@ -23,7 +23,7 @@ Get access to a machine with multiple GPUs (in this case we tested with 4 A100 a
 <details open>
 <summary>Single-node Multi-GPU</summary>
 
-    torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
+    torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
 
 </details>
 
@@ -49,7 +49,7 @@ The args used in the command above are:
 If interested in running full parameter finetuning without making use of PEFT methods, please use the following command. Make sure to change the `nproc_per_node` to your available GPUs. This has been tested with `BF16` on 8xA100, 40GB GPUs.
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
 ```
 
 ### Using less CPU memory (FSDP on 70B model)
@@ -57,7 +57,7 @@ torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name
 If you are running full parameter fine-tuning on the 70B model, you can enable `low_cpu_fsdp` mode with the following command. This option loads the model on rank 0 only before moving it to the devices to construct FSDP. This can dramatically save CPU memory when loading large models like the 70B (on an 8-GPU node, this reduces CPU memory from 2+ TB to 280 GB for the 70B model). This has been tested with `BF16` on 16xA100, 80GB GPUs.
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 8 finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
+torchrun --nnodes 1 --nproc_per_node 8 finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
 ```
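A minimal sketch of the rank-0-only loading pattern that `low_cpu_fsdp` enables is shown below; it is illustrative (the actual `finetuning.py` handles this for you), and the model path is the placeholder from the command above.

```python
# Minimal sketch of rank-0-only loading (what --low_cpu_fsdp enables), assuming
# it is launched with torchrun; finetuning.py implements the real logic.
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from transformers import LlamaConfig, LlamaForCausalLM

dist.init_process_group("nccl")
rank = dist.get_rank()
model_name = "/path_of_model_folder/70B"  # placeholder path from the docs above

if rank == 0:
    # Only rank 0 materializes the full weights in CPU memory
    model = LlamaForCausalLM.from_pretrained(model_name)
else:
    # Other ranks build an empty (meta-device) model and receive weights from rank 0
    with torch.device("meta"):
        model = LlamaForCausalLM(LlamaConfig.from_pretrained(model_name))

model = FSDP(
    model,
    device_id=torch.cuda.current_device(),
    sync_module_states=True,  # broadcast rank 0's weights to the other ranks
    param_init_fn=(None if rank == 0 else
                   lambda m: m.to_empty(device=torch.device("cuda"), recurse=False)),
)
```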
 
 
@@ -79,16 +79,16 @@ To run with each of the datasets set the `dataset` flag in the command as shown
 
 ```bash
 # grammer_dataset
-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -103,7 +103,7 @@ This will require to set the Sharding strategy in [fsdp config](../../src/llama_
 
 ```bash
 
-torchrun --nnodes 4 --nproc_per_node 8 ./finetuning.py --enable_fsdp --low_cpu_fsdp --fsdp_config.pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --hsdp --sharding_group_size n --replica_group_size world_size/n
+torchrun --nnodes 4 --nproc_per_node 8 ./finetuning.py --enable_fsdp --low_cpu_fsdp --fsdp_config.pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --hsdp --sharding_group_size n --replica_group_size world_size/n
 
 ```
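To make the group-size arithmetic concrete, here is a small hedged sketch using the 4-node x 8-GPU topology from the command above; the variable names are illustrative, not flags of the script.

```python
# Hedged sketch of the HSDP group-size arithmetic for the command above.
nnodes, nproc_per_node = 4, 8
world_size = nnodes * nproc_per_node                      # 32 ranks in total
sharding_group_size = 8                                   # shard within each node ("n" above)
replica_group_size = world_size // sharding_group_size    # 4 replicas across nodes
assert sharding_group_size * replica_group_size == world_size
print(replica_group_size)  # -> 4
```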
 

+ 4 - 4
recipes/finetuning/singlegpu_finetuning.md

@@ -16,7 +16,7 @@ To run fine-tuning on a single GPU, we will make use of two packages:
 ## How to run it?
 
 ```bash
-python -m finetuning.py  --use_peft --peft_method lora --quantization --use_fp16 --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m finetuning.py  --use_peft --peft_method lora --quantization --use_fp16 --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 ```
 The args used in the command above are:
 
@@ -48,16 +48,16 @@ to run with each of the datasets set the `dataset` flag in the command as shown
 ```bash
 # grammer_dataset
 
-python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
 

+ 0 - 130
recipes/inference/llama_web_ui/Llama2_Gradio.ipynb

@@ -1,130 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e4532411",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# TODO REFACTOR: Integrate code from _legacy/inference.py into this notebook"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "47a9adb3",
-   "metadata": {},
-   "source": [
-    "## This demo app shows how to query Llama 2 using the Gradio UI.\n",
-    "\n",
-    "Since we are using Replicate in this example, you will need to replace `<your replicate api token>` with your API token.\n",
-    "\n",
-    "To get the Replicate token: \n",
-    "\n",
-    "- You will need to first sign in with Replicate with your github account\n",
-    "- Then create a free API token [here](https://replicate.com/account/api-tokens) that you can use for a while \n",
-    "\n",
-    "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on Replicate.\n",
-    "\n",
-    "To run this example:\n",
-    "- Set up your Replicate API token and enter it in place of `<your replicate api token>`\n",
-    "- Run the notebook\n",
-    "- Enter your question and click Submit\n",
-    "\n",
-    "In the notebook or a browser with URL http://127.0.0.1:7860 you should see a UI with your answer."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "928041cc",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Running on local URL:  http://127.0.0.1:7860\n",
-      "\n",
-      "To create a public link, set `share=True` in `launch()`.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/plain": []
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from langchain.schema import AIMessage, HumanMessage\n",
-    "import gradio as gr\n",
-    "from langchain.llms import Replicate\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"REPLICATE_API_TOKEN\"] = \"<your replicate api token>\"\n",
-    "\n",
-    "llama2_13b_chat = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
-    "\n",
-    "llm = Replicate(\n",
-    "    model=llama2_13b_chat,\n",
-    "    model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n",
-    ")\n",
-    "\n",
-    "\n",
-    "def predict(message, history):\n",
-    "    history_langchain_format = []\n",
-    "    for human, ai in history:\n",
-    "        history_langchain_format.append(HumanMessage(content=human))\n",
-    "        history_langchain_format.append(AIMessage(content=ai))\n",
-    "    history_langchain_format.append(HumanMessage(content=message))\n",
-    "    gpt_response = llm(message) #history_langchain_format)\n",
-    "    return gpt_response#.content\n",
-    "\n",
-    "gr.ChatInterface(predict).launch()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.18"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

+ 0 - 25
recipes/inference/llama_web_ui/README.md

@@ -1,25 +0,0 @@
-## Quick Web UI for Llama2 Chat
-If you prefer to see Llama2 in action in a web UI, instead of the notebooks above, you can try one of the two methods:
-
-### Running [Streamlit](https://streamlit.io/) with Llama2
-Open a Terminal, run the following commands:
-```
-pip install streamlit langchain replicate
-git clone https://github.com/facebookresearch/llama-recipes
-cd llama-recipes/llama-demo-apps
-```
-
-Replace the `<your replicate api token>` in `streamlit_llama2.py` with your API token created [here](https://replicate.com/account/api-tokens) - for more info, see the note [above](#replicate_note).
-
-Then run the command `streamlit run streamlit_llama2.py` and you'll see on your browser the following UI with question and answer - you can enter new text question, click Submit, and see Llama2's answer:
-
-![](../../../docs/images/llama2-streamlit.png)
-![](../../../docs/images/llama2-streamlit2.png)
-
-### Running [Gradio](https://www.gradio.app/) with Llama2 (using [Replicate](Llama2_Gradio.ipynb) or [OctoAI](../../llama_api_providers/OctoAI_API_examples/Llama2_Gradio.ipynb))
-
-To see how to query Llama2 and get answers with the Gradio UI both from the notebook and web, just launch the notebook `Llama2_Gradio.ipynb`. For more info, on how to get set up with a token to power these apps, see the note on [Replicate](../../README.md#replicate_note) and [OctoAI](../../README.md##octoai_note).
-
-Then enter your question, click Submit. You'll see in the notebook or a browser with URL http://127.0.0.1:7860 the following UI:
-
-![](../../../docs/images/llama2-gradio.png)

+ 0 - 3
recipes/inference/llama_web_ui/requirements.txt

@@ -1,3 +0,0 @@
-streamlit
-langchain
-replicate

+ 0 - 27
recipes/inference/llama_web_ui/streamlit_llama2.py

@@ -1,27 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
-
-# TODO REFACTOR: Convert this to an ipynb notebook
-
-import streamlit as st
-from langchain.llms import Replicate
-import os
-
-st.title("Llama2-powered Streamlit App")
-
-with st.sidebar:
-    os.environ["REPLICATE_API_TOKEN"] = "<your replicate api token>"
-
-def generate_response(input_text):
-    llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"
-
-    llm = Replicate(
-        model=llama2_13b_chat,
-        model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens":500}
-    )
-    st.info(llm(input_text))
-
-with st.form("my_form"):
-    text = st.text_area("Enter text:", "What is Generative AI?")
-    submitted = st.form_submit_button("Submit")
-    generate_response(text)

+ 1 - 1
recipes/inference/local_inference/README.md

@@ -69,7 +69,7 @@ In case you have fine-tuned your model with pure FSDP and saved the checkpoints
 This is helpful if you have fine-tuned your model using FSDP only, as follows:
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 8  recipes/finetuning/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16
+torchrun --nnodes 1 --nproc_per_node 8  recipes/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16
 ```
 Then convert your FSDP checkpoint to HuggingFace checkpoints using:
 ```bash

+ 2 - 4
recipes/inference/model_servers/README.md

@@ -1,4 +1,2 @@
-## [Running Llama2 On-Prem with vLLM and TGI](llama-on-prem.md)
-This tutorial shows how to use Llama 2 with [vLLM](https://github.com/vllm-project/vllm) and Hugging Face [TGI](https://github.com/huggingface/text-generation-inference) to build Llama 2 on-prem apps.
-
-\* To run a quantized Llama2 model on iOS and Android, you can use  the open source [MLC LLM](https://github.com/mlc-ai/mlc-llm) or [llama.cpp](https://github.com/ggerganov/llama.cpp). You can even make a Linux OS that boots to Llama2 ([repo](https://github.com/trholding/llama2.c)).
+## [Running Llama 3 On-Prem with vLLM and TGI](llama-on-prem.md)
+This tutorial shows how to use Llama 3 with [vLLM](https://github.com/vllm-project/vllm) and Hugging Face [TGI](https://github.com/huggingface/text-generation-inference) to build Llama 3 on-prem apps.

+ 21 - 99
recipes/llama_api_providers/Azure_API_example/azure_api_example.ipynb

@@ -4,13 +4,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Use Azure API with Llama 2\n",
+    "# Use Azure API with Llama 3\n",
     "\n",
-    "This notebook shows examples of how to use Llama 2 APIs offered by Microsoft Azure. We will cover:  \n",
-    "* HTTP requests API usage for Llama 2 pretrained and chat models in CLI\n",
-    "* HTTP requests API usage for Llama 2 pretrained and chat models in Python\n",
+    "This notebook shows examples of how to use Llama 3 APIs offered by Microsoft Azure. We will cover:  \n",
+    "* HTTP requests API usage for Llama 3 instruct models in CLI\n",
+    "* HTTP requests API usage for Llama 3 instruct models in Python\n",
     "* Plug the APIs into LangChain\n",
     "* Wire the model with Gradio to build a simple chatbot with memory\n",
+    "\n",
     "\n"
    ]
   },
@@ -20,15 +21,13 @@
    "source": [
     "## Prerequisite\n",
     "\n",
-    "Before we start building with Azure Llama 2 APIs, there are certain steps we need to take to deploy the models:\n",
+    "Before we start building with Azure Llama 3 APIs, there are certain steps we need to take to deploy the models:\n",
     "\n",
     "* Register for a valid Azure account with subscription [here](https://azure.microsoft.com/en-us/free/search/?ef_id=_k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&OCID=AIDcmm5edswduu_SEM__k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&gad_source=1&gclid=CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE)\n",
     "* Take a quick look on what is the [Azure AI Studio](https://learn.microsoft.com/en-us/azure/ai-studio/what-is-ai-studio?tabs=home) and navigate to the website from the link in the article\n",
     "* Follow the demos in the article to create a project and [resource](https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/manage-resource-groups-portal) group, or you can also follow the guide [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio)\n",
-    "* Select Llama models from Model catalog\n",
-    "* Deploy with \"Pay-as-you-go\"\n",
-    "\n",
-    "Once deployed successfully, you should be assigned for an API endpoint and a security key for inference.  \n",
+    "* For Llama 3 instruct models from Model catalog, click Deploy in the model page and select \"Pay-as-you-go\". Once deployed successfully, you should be assigned for an API endpoint and a security key for inference.\n",
+    "* For Llama 3 pretrained models, Azure currently only support manual deployment under regular subscription. We are working with them to bring \"Pay-as-you-go\" for pretrained models.\n",
     "\n",
     "For more information, you should consult Azure's official documentation [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio) for model deployment and inference."
    ]
@@ -41,10 +40,12 @@
     "\n",
     "### Basics\n",
     "\n",
+    "The usage and schema of the API are identical to Llama 3 API hosted on Azure.\n",
+    "\n",
     "For using the REST API, You will need to have an Endpoint url and Authentication Key associated with that endpoint.  \n",
     "This can be acquired from previous steps.  \n",
     "\n",
-    "In this text completion example for pre-trained model, we use a simple curl call for illustration. There are three major components:  \n",
+    "In this chat completion example for instruct model, we use a simple curl call for illustration. There are three major components:  \n",
     "\n",
     "* The `host-url` is your endpoint url with completion schema. \n",
     "* The `headers` defines the content type as well as your api key. \n",
@@ -52,20 +53,9 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!curl -X POST -L https://your-endpoint.inference.ai.azure.com/v1/completions -H 'Content-Type: application/json' -H 'Authorization: your-auth-key' -d '{\"prompt\": \"Math is a\", \"max_tokens\": 30, \"temperature\": 0.7}' "
-   ]
-  },
-  {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "For chat completion, the API schema and request payload are slightly different.\n",
-    "\n",
     "The `host-url` needs to be `/v1/chat/completions` and the request payload to include roles in conversations. Here is a sample payload:  \n",
     "\n",
     "```\n",
@@ -100,18 +90,6 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "If you compare the generation result for both text and chat completion API calls, you will notice that:  \n",
-    "\n",
-    "* Text completion returns a list of `choices` for the input prompt, each contains generated text and completion information such as `logprobs`.\n",
-    "* Chat completion returns a list of `choices` each with a `message` object with completion result, matching the `messages` object in the request.  \n",
-    "\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
     "### Streaming\n",
     "\n",
     "One fantastic feature the API offers is the streaming capability.  \n",
@@ -147,7 +125,7 @@
    "source": [
     "### Content Safety Filtering\n",
     "\n",
-    "All Azure Llama 2 API endpoints have content safety feature turned on. Both input prompt and output tokens are filtered by this service automatically.  \n",
+    "All Azure Llama 3 API endpoints have content safety feature turned on. Both input prompt and output tokens are filtered by this service automatically.  \n",
     "To know more about the impact to the request/response payload, please refer to official guide [here](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/content-filter?tabs=python).   \n",
     "\n",
     "For model input and output, if the filter detects there is harmful content, the generation will error out with a response payload containing the reasoning, along with information on the type of content violation and its severity. \n",
@@ -172,7 +150,7 @@
     "\n",
     "Besides calling the API directly from command line tools, you can also programatically call them in Python.  \n",
     "\n",
-    "Here is an example for the text completion model:\n",
+    "Here is an example for the instruct model:\n",
     "\n",
     "\n"
    ]
@@ -187,53 +165,6 @@
     "import json\n",
     "\n",
     "#Configure payload data sending to API endpoint\n",
-    "data = {\"prompt\": \"Math is a\", \n",
-    "         \"max_tokens\": 30, \n",
-    "         \"temperature\": 0.7,\n",
-    "         \"top_p\": 0.9,      \n",
-    "}\n",
-    "\n",
-    "body = str.encode(json.dumps(data))\n",
-    "\n",
-    "#Replace the url with your API endpoint\n",
-    "url = 'https://your-endpoint.inference.ai.azure.com/v1/completions'\n",
-    "\n",
-    "#Replace this with the key for the endpoint\n",
-    "api_key = 'your-auth-key'\n",
-    "if not api_key:\n",
-    "    raise Exception(\"API Key is missing\")\n",
-    "\n",
-    "headers = {'Content-Type':'application/json', 'Authorization':(api_key)}\n",
-    "req = urllib.request.Request(url, body, headers)\n",
-    "\n",
-    "try:\n",
-    "    response = urllib.request.urlopen(req)\n",
-    "    result = response.read()\n",
-    "    print(result)\n",
-    "except urllib.error.HTTPError as error:\n",
-    "    print(\"The request failed with status code: \" + str(error.code))\n",
-    "    # Print the headers - they include the requert ID and the timestamp, which are useful for debugging the failure\n",
-    "    print(error.info())\n",
-    "    print(error.read().decode(\"utf8\", 'ignore'))\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Chat completion in Python is very similar, here is a quick example:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import urllib.request\n",
-    "import json\n",
-    "\n",
-    "#Configure payload data sending to API endpoint\n",
     "data = {\"messages\":[\n",
     "            {\"role\":\"system\", \"content\":\"You are a helpful assistant.\"},\n",
     "            {\"role\":\"user\", \"content\":\"Who wrote the book Innovators dilemma?\"}], \n",
@@ -323,14 +254,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Use Llama 2 API with LangChain\n",
+    "## Use Llama 3 API with LangChain\n",
     "\n",
-    "In this section, we will demonstrate how to use Llama 2 APIs with LangChain, one of the most popular framework to accelerate building your AI product.  \n",
+    "In this section, we will demonstrate how to use Llama 3 APIs with LangChain, one of the most popular framework to accelerate building your AI product.  \n",
     "One common solution here is to create your customized LLM instance, so you can add it to various chains to complete different tasks.  \n",
     "In this example, we will use the `AzureMLOnlineEndpoint` class LangChain provides to build a customized LLM instance. This particular class is designed to take in Azure endpoint and API keys as inputs and wire it with HTTP calls. So the underlying of it is very similar to how we used `urllib.request` library to send RESTful calls in previous examples to the Azure Endpoint.   \n",
     "\n",
-    "Note Azure is working on a standard solution for LangChain integration in this [PR](https://github.com/langchain-ai/langchain/pull/14560), you should consider migrating to that in the future. \n",
-    "\n",
     "First, let's install dependencies: \n",
     "\n"
    ]
@@ -363,7 +292,7 @@
     "\n",
     "\n",
     "class AzureLlamaAPIContentFormatter(ContentFormatterBase):\n",
-    "#Content formatter for Llama 2 API for Azure MaaS\n",
+    "#Content formatter for Llama 3 API for Azure MaaS\n",
     "\n",
     "    def format_request_payload(self, prompt: str, model_kwargs: Dict) -> bytes:\n",
     "        #Formats the request according to the chosen api\n",
@@ -450,18 +379,11 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "At the time of writing this sample notebook, LangChain doesn't support streaming with `AzureMLOnlineEndpoint` for Llama 2. We are working with LangChain and Azure team to implement that."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Build a chatbot with Llama 2 API\n",
+    "## Build a chatbot with Llama 3 API\n",
     "\n",
-    "In this section, we will build a simple chatbot using Azure Llama 2 API, LangChain and [Gradio](https://www.gradio.app/)'s `ChatInterface` with memory capability.\n",
+    "In this section, we will build a simple chatbot using Azure Llama 3 API, LangChain and [Gradio](https://www.gradio.app/)'s `ChatInterface` with memory capability.\n",
     "\n",
-    "Gradio is a framework to help demo your machine learning model with a web interface. We also have a dedicated Gradio chatbot [example](https://github.com/meta-llama/llama-recipes/blob/main/recipes/use_cases/chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) built with Llama 2 on-premises with RAG.   \n",
+    "Gradio is a framework to help demo your machine learning model with a web interface. We also have a dedicated Gradio chatbot [example](https://github.com/meta-llama/llama-recipes/blob/main/recipes/use_cases/chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) built with Llama 3 on-premises with RAG.   \n",
     "\n",
     "First, let's install Gradio dependencies.\n"
    ]
@@ -508,7 +430,7 @@
     "langchain.debug=True\n",
     "\n",
     "class AzureLlamaAPIContentFormatter(ContentFormatterBase):\n",
-    "#Content formatter for Llama 2 API for Azure MaaS\n",
+    "#Content formatter for Llama 3 API for Azure MaaS\n",
     "\n",
     "    def format_request_payload(self, prompt: str, model_kwargs: Dict) -> bytes:\n",
     "        #Formats the request according to the chosen api\n",
@@ -602,7 +524,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.10"
+   "version": "3.9.6"
   }
  },
  "nbformat": 4,

File diff suppressed because it is too large
+ 698 - 0
recipes/use_cases/MediaGen.ipynb


+ 7 - 1
recipes/use_cases/README.md

@@ -14,4 +14,10 @@ This step-by-step tutorial shows how to use the [WhatsApp Business API](https://
 This step-by-step tutorial shows how to use the [Messenger Platform](https://developers.facebook.com/docs/messenger-platform/overview) to build a Llama 3 enabled Messenger chatbot.
 
 ### RAG Chatbot Example (running [locally](./chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) or on [OctoAI](../llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/RAG_Chatbot_Example.ipynb))
-A complete example of how to build a Llama 3 chatbot hosted on your browser that can answer questions based on your own data using retrieval augmented generation (RAG). You can run Llama2 locally if you have a good enough GPU or on OctoAI if you follow the note [here](../README.md#octoai_note).
+A complete example of how to build a Llama 3 chatbot hosted on your browser that can answer questions based on your own data using retrieval augmented generation (RAG). You can run Llama2 locally if you have a good enough GPU or on OctoAI if you follow the note [here](../README.md#octoai_note).
+
+## [Sales Bot](./chatbots/sales_bot/SalesBot.ipynb): Sales Bot with Llama3 - A Summarization and RAG Use Case
+A summarization + RAG use case built around the Amazon product review Kaggle dataset to build a helpful Music Store Sales Bot. The summarization and RAG are built on top of Llama models hosted on OctoAI, and the vector database is hosted on Weaviate Cloud Services.
+
+## [Media Generation](./MediaGen.ipynb): Building a Video Generation Pipeline with Llama3
+This step-by-step tutorial shows how to leverage Llama 3 to drive the generation of animated videos using SDXL and SVD. More specifically, it relies on JSON formatting to produce a scene-by-scene storyboard of a recipe video. The user provides the name of a dish, then Llama 3 describes a step-by-step guide to reproduce that dish. This step-by-step guide is brought to life with models like SDXL and SVD.

File diff suppressed because it is too large
+ 10262 - 0
recipes/use_cases/chatbots/sales_bot/Musical_instruments_reviews.csv


+ 668 - 0
recipes/use_cases/chatbots/sales_bot/SalesBot.ipynb

@@ -0,0 +1,668 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "374b67d0-b446-4d6f-8e07-59e97716c55a",
+   "metadata": {},
+   "source": [
+    "# Sales Bot with Llama3 - A Summarization and RAG Use Case"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "add4953d-07c3-4480-ad91-7d0ea9c9fb55",
+   "metadata": {},
+   "source": [
+    "## Overview\n",
+    "\n",
+    "In this notebook you'll take an Amazon product reviews dataset from Kaggle and use Llama3 to obtain product review summaries, upsert those summaries in a vector database, then use Retrieval Augmented Generation (RAG) to power a sales chatbot that can make targeted product recommendations.\n",
+    "\n",
+    "Let's take a look at the overall workflow:\n",
+    "1. We start with a dataset that contains over 10,000 reviews across 900 Amazon musical instruments and accessories.\n",
+    "2. Using Llama2 70B chat (hosted on OctoAI), we generate summaries of product reviews for each product from the 20 most recent reviews. We format the summaries in JSON format.\n",
+    "3. We then take the summaries and upsert them into a vector database (Weaviate in this case)\n",
+    "4. We then use this vector database and Llama3 70B instruct (hosted on OctoAI) to build a RAG-based sales chatbot that provides targeted recommendations to the user based on the products that are present in the inventory.\n",
+    "\n",
+    "Note: at the time of writing this tutorial, JSON mode formatting isn't supported for Llama 3 on OctoAI via constrained sampling which is why we are falling back onto Llama 2. This tutorial will be updated when the feature becomes available to rely on Llama 3 exclusively.\n",
+    "\n",
+    "### OctoAI\n",
+    "We'll use [OctoAI](https://octo.ai/) to power all of the GenAI model needs of this notebook: LLMs, image gen, image animation.\n",
+    "* To use OctoAI, you'll need to go to https://octoai.cloud/ and sign in using your Google or GitHub account.\n",
+    "* Next you'll need to generate an OctoAI API token by following these [instructions](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token). Keep the API token in hand, we'll need it further down in this notebook.\n",
+    "\n",
+    "In this example we will use the Llama 3 70b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
+    "\n",
+    "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
+    "* meta-llama-3-8b-instruct\n",
+    "* meta-llama-3-70b-instruct\n",
+    "* codellama-7b-instruct\n",
+    "* codellama-13b-instruct\n",
+    "* codellama-34b-instruct\n",
+    "* llama-2-13b-chat\n",
+    "* llama-2-70b-chat\n",
+    "* llamaguard-7b\n",
+    "\n",
+    "### Weaviate\n",
+    "We'll use Weaviate Cloud Services (WCS) for our vector database. You can create an account and Weaviate clusters easily at the following link: https://console.weaviate.cloud/.\n",
+    "You can then create a cluster, from which you can obtain the REST Endpoint URL and the API key to use the cluster endpoint.\n",
+    "\n",
+    "### OpenAI\n",
+    "We'll be using OpenAI for its embedding model to upsert our vectors into the Weaviate vector database. Create an account and obtain an API key here: https://openai.com/api/\n",
+    "\n",
+    "### Local Python Notebook\n",
+    "We highly recommend launching this notebook from a fresh python environment, for instance you can run the following:\n",
+    "```\n",
+    "python3 -m venv .venv         \n",
+    "source .venv/bin/activate\n",
+    "```\n",
+    "All you need to run this notebook is to install jupyter notebook with `python3 -m pip install notebook` then run `jupyter notebook` ([link](https://jupyter.org/install)) in the same directory as this `.ipynb` file.\n",
+    "You don't need to install additional pip packages ahead of running the notebook, since those will be installed right at the beginning. You will need to ensure your system has `imagemagick` installed by following the [instructions](https://imagemagick.org/script/download.php)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "133c2ea4-0256-49cf-9f5a-a9e5bb0bb63f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's start by installing the appropriate python packages\n",
+    "! pip install octoai===1.0.2 openai weaviate-client pandas gradio pydantic"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "75341227-43f8-4a68-b3cb-31e8216f874e",
+   "metadata": {},
+   "source": [
+    "## Part 1: Review Summarization"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "793c06d7-fa67-4c67-a380-081ed3a7a7bf",
+   "metadata": {},
+   "source": [
+    "Let's start by importing all of the packages we need for this example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "edd366c8-4f0b-4211-83d3-c16e88cbd5c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gradio\n",
+    "import json\n",
+    "import langchain\n",
+    "import os\n",
+    "import openai\n",
+    "import weaviate\n",
+    "from getpass import getpass\n",
+    "from json import loads\n",
+    "from pandas import DataFrame, concat, read_csv\n",
+    "from pydantic import BaseModel, Field\n",
+    "from typing import List\n",
+    "import weaviate.classes as wvc"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cd171a7c-c5e7-46d5-8a04-a0f7863609be",
+   "metadata": {},
+   "source": [
+    "Enter your OctoAI, Weaviate, and OpenAI tokens below"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3af09686-a654-45b0-98c5-dee6f30440c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get OctoAI API token for Llama 2 & 3\n",
+    "OCTOAI_API_TOKEN = getpass()\n",
+    "os.environ[\"OCTOAI_API_TOKEN\"] = OCTOAI_API_TOKEN"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "31c3e684-6e5e-41ad-81d4-970b06522553",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get WCS API key\n",
+    "WCS_API_KEY = getpass()\n",
+    "os.environ[\"WCS_API_KEY\"] = WCS_API_KEY"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a44f7b71-c4f9-4fd6-9a3b-1322c2fd0c35",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get WCS URL\n",
+    "WCS_URL = getpass()\n",
+    "os.environ[\"WCS_URL\"] = WCS_URL"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e4502dfa-c369-4085-a697-fdcda00f970b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get OpenAI API key for the embedding model\n",
+    "OPENAI_API_KEY = getpass()\n",
+    "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "883986ad-9f60-44d8-ab64-3f566261e055",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# First let's load the dataset from Kaggle: https://www.kaggle.com/datasets/eswarchandt/amazon-music-reviews\n",
+    "df = read_csv('Musical_instruments_reviews.csv')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c05865a7-307a-425e-a6ee-f057d63db77b",
+   "metadata": {},
+   "source": [
+    "Set `product_record_limit` to a lower number if you just want to do a test run"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22f024e7-3976-425f-b684-8b2c2c1ed191",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set a product record limit\n",
+    "product_record_limit = 900"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "06554f51-5983-42fc-8a8e-684ae82099db",
+   "metadata": {
+    "scrolled": true
+   },
+   "source": [
+    "# List all of the unique ASIN:\n",
+    "asin_list = df.asin.unique()\n",
+    "print(\"There are {} unique products in the music product inventory\".format(len(asin_list)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4941baa1-107b-4f39-8d04-1daa5acd465b",
+   "metadata": {},
+   "source": [
+    "For each one of the unique products, let's group the reviews together and sort them by how recent they are"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "38147b91-2425-46a7-b6c0-221173d81024",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get the reviews for the product ASIN, sorted by recency and store in dict\n",
+    "review_dict = {}\n",
+    "for asin in asin_list[0:product_record_limit]:\n",
+    "    reviews = df.loc[df['asin'] == asin]\\\n",
+    "                .sort_values([\"unixReviewTime\"], axis=0, ascending=False)\\\n",
+    "                .reviewText.tolist()\n",
+    "    review_dict[asin] = reviews"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7d5fb78d-808a-4753-abba-4a3066d76ba7",
+   "metadata": {},
+   "source": [
+    "To be able to store our summaries into our vector DB, we need to have the fields formatted into a JSON object. We use Pydantic base class model here to define our formatting."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b786cde1-116a-47eb-8478-3fa2285dcf9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the Pydantic model that specifies how our output should be formatted\n",
+    "class ProductRecord(BaseModel):\n",
+    "    \"\"\"The record of a given product\"\"\"\n",
+    "    description: str = Field(description=\"Description of the product\")\n",
+    "    name: str = Field(description=\"Name of the product\")\n",
+    "    review_summary: str = Field(description=\"Summary of all of the reviews\")\n",
+    "    ASIN: str = Field(description=\"ASIN of the product\")\n",
+    "    features: str = Field(description=\"Features of the product based on the reviews\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "08226a6e-f994-454b-9a1d-6246b34bfca2",
+   "metadata": {},
+   "source": [
+    "We define our prompt template below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cc3fe69-bf0c-4a50-8d9c-1ae6cb99a9ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prepare a prompt template\n",
+    "template = '''\n",
+    "Here are product reviews for a music product with an ID of {asin}.\n",
+    " - Respond back only as only JSON!\n",
+    " - Provide:\n",
+    "     - the product \"description\",\n",
+    "     - the product \"name\",\n",
+    "     - a summary of all the reviews as \"review_summary\",\n",
+    "     - the \"ASIN\" and\n",
+    "     - and the product \"features\" based on the content of these reviews. \n",
+    " - The \"features\" should be a string describing the features and NOT JSON. \n",
+    " - Do not include the ASIN in the description field.\n",
+    " \n",
+    "The reviews for the product are: {reviews}\n",
+    "'''"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9b8dc3fa-4ad9-4329-96a0-353b05a1c43e",
+   "metadata": {},
+   "source": [
+    "We initialize the OctoAI client using OpenAI's API. All we have to do is override the `base_url` and `api_key`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57c2ff0a-8029-41a6-a06f-41e560b92230",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Init OctoAI client\n",
+    "client = openai.OpenAI(\n",
+    "    base_url=\"https://text.octoai.run/v1\",\n",
+    "    api_key=os.environ[\"OCTOAI_API_TOKEN\"]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bd0eb425-ceea-4258-a52d-814b7335febb",
+   "metadata": {},
+   "source": [
+    "Iterate over all product ASINs and summarize the top 20 most recent reviews. Note: this takes a while to run unless we parallelize it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1a55839e-a824-4919-b755-730eaac48d83",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Produce the 900 product summaries\n",
+    "review_summaries = []\n",
+    "counter = 0\n",
+    "\n",
+    "# This can take a while to process serially (30min+)\n",
+    "# TODO: Optimize to run in a few parallel threads to run faster while meeting the 240RPM limit\n",
+    "for asin, review_list in review_dict.items():\n",
+    "    print(f'Getting review summary {counter} of {len(review_dict)}, ASIN: {asin}')\n",
+    "    try:\n",
+    "        response = client.chat.completions.create(\n",
+    "            model=\"llama-2-70b-chat\",\n",
+    "            messages=[\n",
+    "                {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+    "                {\"role\": \"user\", \"content\": template.format(\n",
+    "                    asin = asin,\n",
+    "                    reviews = review_list[0:20]\n",
+    "                )},\n",
+    "            ],\n",
+    "            temperature=0,\n",
+    "            response_format={\"type\": \"json_object\", \"schema\": ProductRecord.model_json_schema()},\n",
+    "            max_tokens=1024\n",
+    "        )\n",
+    "        print(\"\\n{}\\n\".format(response.choices[0].message.content))\n",
+    "        summary = loads(response.choices[0].message.content)\n",
+    "        summary[\"ASIN\"] = asin\n",
+    "        review_summaries.append(summary)\n",
+    "    except:\n",
+    "        print(f'Issue with ASIN {asin}, skipping')\n",
+    "        pass\n",
+    "    counter += 1\n",
+    "\n",
+    "review_summaries = DataFrame(review_summaries)\n",
+    "\n",
+    "print(review_summaries)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4772d1c1-c9c4-466e-9c80-259804a4286b",
+   "metadata": {},
+   "source": [
+    "# Part 2: Retrieval Augmented Generation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ccd97408-d47f-46f3-b601-f66f8a3b20ff",
+   "metadata": {},
+   "source": [
+    "For our RAG use case we're going to rely on Weaviate vector database and on an OpenAI embedding model. \n",
+    "\n",
+    "When you define your collection, you'll need to provide properties, i.e. object attributes that you want to store in the collection. These properties map 1:1 to the JSON dictionary keys defined earlier for the `ProductRecord` Pydantic base model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5dad98ec-531d-4fc2-aed9-9f337b957feb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Connect to WCS\n",
+    "wcs_client = weaviate.connect_to_wcs(\n",
+    "    cluster_url=os.getenv(\"WCS_URL\"),\n",
+    "    auth_credentials=weaviate.auth.AuthApiKey(os.getenv(\"WCS_API_KEY\")),\n",
+    "    headers={\n",
+    "        \"X-OpenAI-Api-Key\": os.environ[\"OPENAI_API_KEY\"]\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02953f7b-0149-4c13-a7cc-c4dd1da45d43",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the collection if it doesn't already exist\n",
+    "try:\n",
+    "    collection = wcs_client.collections.get(\"Products\")\n",
+    "except:\n",
+    "    # Create the collection for products\n",
+    "    collection = wcs_client.collections.create(\n",
+    "        name=\"Products\",\n",
+    "        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),\n",
+    "        properties=[\n",
+    "            wvc.config.Property(\n",
+    "                name=\"ASIN\",\n",
+    "                data_type=wvc.config.DataType.TEXT\n",
+    "            ),\n",
+    "             wvc.config.Property(\n",
+    "                name=\"name\",\n",
+    "                data_type=wvc.config.DataType.TEXT\n",
+    "            ),\n",
+    "             wvc.config.Property(\n",
+    "                name=\"review_summary\",\n",
+    "                data_type=wvc.config.DataType.TEXT\n",
+    "            ),\n",
+    "             wvc.config.Property(\n",
+    "                name=\"features\",\n",
+    "                data_type=wvc.config.DataType.TEXT\n",
+    "            ),\n",
+    "             wvc.config.Property(\n",
+    "                name=\"description\",\n",
+    "                data_type=wvc.config.DataType.TEXT\n",
+    "            ),\n",
+    "        ]\n",
+    "    )\n",
+    "    print(\"Collection Created!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1551fd74-b143-4c02-9b56-364d33683fd3",
+   "metadata": {},
+   "source": [
+    "Now we upsert all of the vectors into the databse using OpenAI's embedding model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53f779e7-b875-4a19-9f9c-74b45992608e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert df to JSON string and then to a list of dictionaries\n",
+    "data = review_summaries.to_json(orient='records')\n",
+    "data_list = json.loads(data)\n",
+    "\n",
+    "items_to_insert = []\n",
+    "\n",
+    "for d in data_list:\n",
+    "    new_item = {\n",
+    "        \"ASIN\": d[\"ASIN\"],\n",
+    "        \"name\": d[\"name\"],\n",
+    "        \"description\": d[\"description\"],  \\\n",
+    "        \"features\": d[\"features\"],\n",
+    "        \"review_summary\": d[\"review_summary\"]\n",
+    "    }\n",
+    "    items_to_insert.append(new_item)\n",
+    "\n",
+    "    # Insert every 100 items\n",
+    "    if len(items_to_insert) == 100:\n",
+    "        collection.data.insert_many(items_to_insert)\n",
+    "        items_to_insert.clear()\n",
+    "\n",
+    "# Insert remaining items\n",
+    "if len(items_to_insert) > 0:\n",
+    "    collection.data.insert_many(items_to_insert)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "35079318-41a5-46fc-8475-5d728550fb88",
+   "metadata": {},
+   "source": [
+    "Let's now try to run a hybrid search on the following query below.\n",
+    "Hybrid search combines the results of a vector search and a keyword (BM25F) search by fusing the two result sets.\n",
+    "It will return the 3 closest entries in the database according to the search criteria."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f707954-c36b-4a83-874b-f817bd33c39a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hybrid search\n",
+    "response = collection.query.hybrid(\n",
+    "    query=\"easy to learn instrument\",\n",
+    "    limit=3\n",
+    ")\n",
+    "for o in response.objects:\n",
+    "    print(o.properties)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "04d39507-5e8e-4374-a33c-53e57db6ef99",
+   "metadata": {},
+   "source": [
+    "Let's now define a helper function that gives us the relevant context given a string query. Let's see what it returns based on the question: \"What is a good beginner harmonica\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1ca51c7-83e5-4896-acc9-753060592ba0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Helper function to run hybrid search on a user query and return the closest\n",
+    "# product review summaries relevant to the user query\n",
+    "def get_context(question, limit=3):\n",
+    "    response = collection.query.hybrid(\n",
+    "        query=question,\n",
+    "        limit=limit\n",
+    "    )\n",
+    "    return \"\\n\".join([str(o.properties) for o in response.objects])\n",
+    "\n",
+    "print(get_context(\"What is a good beginner harmonica\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "677f534c-8be4-4b6b-82d9-2df8e2ad12d4",
+   "metadata": {},
+   "source": [
+    "Great, we're now ready to build a sales assistant helper function.\n",
+    "\n",
+    "We first define a prompt template for Llama 3 - based on the context provided by the vector hybrid search (i.e. collection of product summaries of relevance to the question), provide a helpful recommendation to the customer. \n",
+    "\n",
+    "Also provide links to the product that the user can click on to view the product on Amazon's website. For that we use the fact that any product referenced by its aSIN can be accessed at the following url: `https://www.amazon.com/exec/obidos/ASIN/<insert aSIN here>`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "856d021a-add5-48f4-a09c-258d2a617095",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sales_template = \"\"\"\n",
+    "You are a sales assistant. Answer the user questions as helpfully as possible.\n",
+    "Only recommend the products that are provided in the context provided below.\n",
+    "\n",
+    "Provide a reference to each product you mention with hyperlinks:\n",
+    "* Provide the name of the product\n",
+    "* Embed the hyperlink in the name of the product as follows\n",
+    "    * If the product name is \"Solid Electric Guitar Case with Accessories Compartment\"\n",
+    "    * And the aSIN is \"B001EL6I8W\"\n",
+    "    * Format the reference as follows: \n",
+    "         [Solid Electric Guitar Case with Accessories Compartment](https://www.amazon.com/exec/obidos/ASIN/B001EL6I8W)\n",
+    "\n",
+    "Finish with a references section.\n",
+    "\n",
+    "Customer question: {}\n",
+    "\n",
+    "Product context: {}\n",
+    "\n",
+    "AI:\n",
+    "\"\"\"\n",
+    "\n",
+    "def sales_assistant(question):  \n",
+    "    response = client.chat.completions.create(\n",
+    "                model=\"meta-llama-3-70b-instruct\",\n",
+    "                messages=[\n",
+    "                    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+    "                    {\"role\": \"user\", \"content\": sales_template.format(question, get_context(question, limit=10))},\n",
+    "                ],\n",
+    "                temperature=0,\n",
+    "                max_tokens=1024\n",
+    "            )\n",
+    "    \n",
+    "    return response.choices[0].message.content\n",
+    "\n",
+    "print(sales_assistant(\"what is must have accessory for my new electric guitar\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "faccba14-9216-4420-b6c5-ddf4029d7904",
+   "metadata": {},
+   "source": [
+    "# Part 3: Gradio-based sales assistant demo"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3e2b73b5-6bdf-4c87-b044-2690fd52605f",
+   "metadata": {},
+   "source": [
+    "In this section we build a simple an interactive sales bot assistant using Gradio."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53805acb-3e8d-40fa-8045-c589cb14eadd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gradio as gr\n",
+    "\n",
+    "def predict(message, history):\n",
+    "    history_openai_format = []\n",
+    "    for human, assistant in history:\n",
+    "        history_openai_format.append({\"role\": \"user\", \"content\": human})\n",
+    "        history_openai_format.append({\"role\": \"assistant\", \"content\": assistant})\n",
+    "    history_openai_format.append({\"role\": \"user\", \"content\": sales_template.format(message, get_context(message, limit=5))})\n",
+    "\n",
+    "    response = client.chat.completions.create(\n",
+    "        model = 'meta-llama-3-70b-instruct',\n",
+    "        messages = history_openai_format,\n",
+    "        temperature = 0.0,\n",
+    "        stream = True\n",
+    "     )\n",
+    "\n",
+    "    partial_message = \"\"\n",
+    "    for chunk in response:\n",
+    "        if chunk.choices[0].delta.content is not None:\n",
+    "              partial_message = partial_message + chunk.choices[0].delta.content\n",
+    "              yield partial_message\n",
+    "\n",
+    "gr.ChatInterface(predict).launch()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6d4e65fe-0246-40b7-adb6-9091cccbc486",
+   "metadata": {},
+   "source": [
+    "**Authors**\n",
+    "- Thierry Moreau, OctoAI - tmoreau@octo.ai\n",
+    "- Jonathan Tuite, Weaviate - jon@weaviate.io"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
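
As a quick reference alongside the notebook above, here is a minimal sketch of the hybrid-search lookup the sales assistant relies on. It assumes the same `Products` collection and the `WCS_URL`, `WCS_API_KEY`, and `OPENAI_API_KEY` environment variables used in the notebook; unlike the notebook cells, it also closes the client explicitly when done.

```python
# Minimal sketch of the hybrid-search lookup used by the sales assistant.
# Assumes the "Products" collection from the notebook already exists and that
# WCS_URL, WCS_API_KEY, and OPENAI_API_KEY are set in the environment.
import os
import weaviate

client = weaviate.connect_to_wcs(
    cluster_url=os.getenv("WCS_URL"),
    auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WCS_API_KEY")),
    headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
)
try:
    products = client.collections.get("Products")
    # Hybrid search fuses vector similarity with BM25F keyword scoring.
    response = products.query.hybrid(query="easy to learn instrument", limit=3)
    for obj in response.objects:
        print(obj.properties)
finally:
    client.close()  # release the connection when done
```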

File diff suppressed because it is too large
+ 1261 - 0
recipes/use_cases/llamaindex_cookbook.ipynb


+ 6 - 0
scripts/spellcheck_conf/wordlist.txt

@@ -1317,3 +1317,9 @@ vectorstore
 AgentFinish
 ReAct
 customizable
+Kaggle
+SalesBot
+Weaviate
+MediaGen
+SDXL
+SVD

+ 2 - 3
src/llama_recipes/finetuning.py

@@ -166,8 +166,7 @@ def main(**kwargs):
     #setting up FSDP if enable_fsdp is enabled
     if train_config.enable_fsdp:
         if not train_config.use_peft and train_config.freeze_layers:
-
-            freeze_transformer_layers(train_config.num_freeze_layers)
+            freeze_transformer_layers(model, train_config.num_freeze_layers)
 
         mixed_precision_policy, wrapping_policy = get_policies(fsdp_config, rank)
         my_auto_wrapping_policy = fsdp_auto_wrap_policy(model, LlamaDecoderLayer)
@@ -217,7 +216,7 @@ def main(**kwargs):
         split="test",
     )
     if not train_config.enable_fsdp or rank == 0:
-            print(f"--> Validation Set Length = {len(dataset_val)}")
+        print(f"--> Validation Set Length = {len(dataset_val)}")
 
     if train_config.batching_strategy == "packing":
         dataset_train = ConcatDataset(dataset_train, chunk_size=train_config.context_length)

+ 1 - 1
src/llama_recipes/utils/config_utils.py

@@ -34,7 +34,7 @@ def update_config(config, **kwargs):
                     if hasattr(config, param_name):
                         setattr(config, param_name, v)
                     else:
-                        # In case of specialized config we can warm user
+                        # In case of specialized config we can warn user
                         print(f"Warning: {config_name} does not accept parameter: {k}")
             elif isinstance(config, train_config):
                 print(f"Warning: unknown parameter {k}")

+ 2 - 0
src/llama_recipes/utils/train_utils.py

@@ -103,6 +103,8 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
     val_loss =[]
 
     if train_config.save_metrics:
+        if not os.path.exists(train_config.output_dir):
+            os.makedirs(train_config.output_dir, exist_ok=True)
         metrics_filename = f"{train_config.output_dir}/metrics_data_{local_rank}-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
         train_step_perplexity = []
         train_step_loss = []