2 tahun lalu · 61cdf88225
--- a/.github/scripts/check_copyright_header.py
+++ b/.github/scripts/check_copyright_header.py
--- a/.github/scripts/markdown_link_check_config.json
+++ b/.github/scripts/markdown_link_check_config.json
--- a/scripts/spellcheck.sh
+++ b/scripts/spellcheck.sh
@@ -19,5 +19,5 @@ done
 
				 if [ ! "$sources_arg" ]; then
			
 
				 	echo "No files to spellcheck"
			
 
				 else
			
 
				-	pyspelling -c scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources_arg
			
 
				+	pyspelling -c .github/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources_arg
			
 
				 fi
			
--- a/scripts/spellcheck_conf/spellcheck.yaml
+++ b/scripts/spellcheck_conf/spellcheck.yaml
@@ -5,8 +5,8 @@ matrix:
 
				     d: en_US
			
 
				   dictionary:
			
 
				     wordlists:
			
 
				-    - scripts/spellcheck_conf/wordlist.txt
			
 
				-    output: scripts/spellcheck_conf/wordlist.dic
			
 
				+    - .github/scripts/spellcheck_conf/wordlist.txt
			
 
				+    output: .github/scripts/spellcheck_conf/wordlist.dic
			
 
				     encoding: utf-8
			
 
				   pipeline:
			
 
				   - pyspelling.filters.context:
			
--- a/scripts/spellcheck_conf/wordlist.txt
+++ b/scripts/spellcheck_conf/wordlist.txt
@@ -1310,3 +1310,44 @@ leaderboards
 
				 txn
			
 
				 ollama
			
 
				 tavily
			
 
				+AgentExecutor
			
 
				+LangGraph
			
 
				+langgraph
			
 
				+vectorstore
			
 
				+CMake
			
 
				+Chipset
			
 
				+JBR
			
 
				+JNI
			
 
				+MLCChat
			
 
				+MTP
			
 
				+MacBook
			
 
				+Moreau
			
 
				+NDK
			
 
				+NDK's
			
 
				+OSX
			
 
				+OnePlus
			
 
				+OxygenOS
			
 
				+SoC
			
 
				+Sonoma
			
 
				+TVM
			
 
				+Thierry
			
 
				+Wifi
			
 
				+chipset
			
 
				+feb
			
 
				+moreau
			
 
				+octo
			
 
				+rustc
			
 
				+rustup
			
 
				+sha
			
 
				+tmoreau
			
 
				+toolchain
			
 
				+wifi
			
 
				+AgentFinish
			
 
				+ReAct
			
 
				+customizable
			
 
				+Kaggle
			
 
				+SalesBot
			
 
				+Weaviate
			
 
				+MediaGen
			
 
				+SDXL
			
 
				+SVD
			
--- a/.github/workflows/spellcheck.yml
+++ b/.github/workflows/spellcheck.yml
@@ -20,11 +20,11 @@ jobs:
 
				         uses: gaurav-nelson/github-action-markdown-link-check@1.0.13
			
 
				         with:
			
 
				           use-verbose-mode: 'yes'
			
 
				-          config-file: "scripts/markdown_link_check_config.json"
			
 
				+          config-file: ".github/scripts/markdown_link_check_config.json"
			
 
				 
			
 
				       - name: Get changed files
			
 
				         id: changed-files
			
 
				-        uses: tj-actions/changed-files@v29.0.4
			
 
				+        uses: tj-actions/changed-files@v41.0.0
			
 
				         with:
			
 
				 
			
 
				           files: |
			
@@ -42,7 +42,7 @@ jobs:
 
				 
			
 
				       - name: Get changed files
			
 
				         id: changed-files
			
 
				-        uses: tj-actions/changed-files@v29.0.4
			
 
				+        uses: tj-actions/changed-files@v41.0.0
			
 
				         with:
			
 
				           files: |
			
 
				             **/*.md
			
@@ -56,11 +56,11 @@ jobs:
 
				           if [ ! "$sources" ]; then
			
 
				             echo "No files to spellcheck"
			
 
				           else
			
 
				-            pyspelling -c $GITHUB_WORKSPACE/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources
			
 
				+            pyspelling -c $GITHUB_WORKSPACE/.github/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources
			
 
				           fi
			
 
				 
			
 
				       - name: In the case of misspellings
			
 
				         if: ${{ failure() }}
			
 
				         run: |
			
 
				           echo "Please fix the misspellings. If you are sure about some of them, "
			
 
				-          echo "so append those to scripts/spellcheck_conf/wordlist.txt"
			
 
				+          echo "so append those to .github/scripts/spellcheck_conf/wordlist.txt"
			
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -43,17 +43,17 @@ For development and contributing to llama-recipes please install from source wit
 
				 pip install -U pip setuptools
			
 
				 pip install --extra-index-url https://download.pytorch.org/whl/test/cu118 -e .[tests,auditnlg,vllm]
			
 
				 ```
			
 
				-The unit tests can be found in the [tests](./tests/) folder and you can run them from the main directory using:
			
 
				+The unit tests can be found in the [src/tests](./src/tests/) folder and you can run them from the main directory using:
			
 
				 ```
			
 
				-python -m pytest tests/
			
 
				+python -m pytest src/tests/
			
 
				 ```
			
 
				 To run all tests of a single file you can give the filename directly:
			
 
				 ```
			
 
				-python -m pytest tests/test_finetuning.py
			
 
				+python -m pytest src/tests/test_finetuning.py
			
 
				 ```
			
 
				 To run a specific test you can filter for its name with
			
 
				 ```
			
 
				-python -m pytest tests/test_finetuning.py -k test_finetuning_peft
			
 
				+python -m pytest src/tests/test_finetuning.py -k test_finetuning_peft
			
 
				 ```
			
 
				 To add a new test simply create a new test file under the tests folder (filename has to start with `test_`).
			
 
				 Group tests spanning the same feature in the same file and create a subfolder if the tests are very extensive.
			
--- a/README.md
+++ b/README.md
@@ -64,6 +64,10 @@ If you want to use PyTorch nightlies instead of the stable release, go to [this
 
				 ### Installing
			
 
				 Llama-recipes provides a pip distribution for easy install and usage in other projects. Alternatively, it can be installed from source.
			
 
				 
			
 
				+> [!NOTE]
			
 
				+> Ensure you use the correct CUDA version (from `nvidia-smi`) when installing the PyTorch wheels. Here we are using 11.8 as `cu118`.
			
 
				+> H100 GPUs work better with CUDA >12.0
			
 
				+
			
 
				 #### Install with pip
			
 
				 ```
			
 
				 pip install llama-recipes
			
--- a/docs/LLM_finetuning.md
+++ b/docs/LLM_finetuning.md
@@ -1,10 +1,10 @@
 
				 ## LLM Fine-Tuning
			
 
				 
			
 
				-Here we discuss fine-tuning Llama 2 with a couple of different recipes. We will cover two scenarios here:
			
 
				+Here we discuss fine-tuning Meta Llama 3 with a couple of different recipes. We will cover two scenarios here:
			
 
				 
			
 
				 
			
 
				 ## 1. **Parameter Efficient Model Fine-Tuning**
			
 
				- This helps make the fine-tuning process more affordable even on 1 consumer grade GPU. These methods enable us to keep the whole model frozen and to just add tiny learnable parameters/ layers into the model. In this way, we just train a very tiny portion of the parameters. The most famous method in this category is [LORA](https://arxiv.org/pdf/2106.09685.pdf), LLaMA Adapter and Prefix-tuning.
			
 
				+ This helps make the fine-tuning process more affordable even on 1 consumer grade GPU. These methods enable us to keep the whole model frozen and to just add tiny learnable parameters/ layers into the model. In this way, we just train a very tiny portion of the parameters. The most famous method in this category is [LORA](https://arxiv.org/pdf/2106.09685.pdf), Llama Adapter and Prefix-tuning.
			
 
				 
			
 
				 
			
 
				 These methods will address three aspects:
			
@@ -14,7 +14,7 @@ These methods will address three aspects:
 
				 
			
 
				 - **Cost of deployment** – for each fine-tuned downstream model we need to deploy a separate model; however, when using these methods, only a small set of parameters (few MB instead of several GBs) of the pretrained model can do the job. In this case, for each task we only add these extra parameters on top of the pretrained model so pretrained models can be assumed as backbone and these parameters as heads for the model on different tasks.
			
 
				 
			
 
				-- **Catastrophic forgetting** — these methods also help with forgetting the first task that can happen in fine-tunings.
			
 
				+- **Catastrophic forgetting** — these methods also help with forgetting the first task that can happen in fine-tuning.
			
 
				 
			
 
				 HF [PEFT](https://github.com/huggingface/peft) library provides an easy way of using these methods which we make use of here. Please read more [here](https://huggingface.co/blog/peft).
			
 
				 
			
@@ -42,7 +42,7 @@ You can also keep most of the layers frozen and only fine-tune a few layers. The
 
				 
			
 
				 
			
 
				 
			
 
				-In this scenario depending on the model size, you might need to go beyond one GPU, especially if your model does not fit into one GPU for training. In this case Llama 2 7B parameter won't fit into one gpu.
			
 
				+In this scenario depending on the model size, you might need to go beyond one GPU, especially if your model does not fit into one GPU for training. In this case Meta Llama 3 8B parameter won't fit into one gpu.
			
 
				 The way you want to think about it is, you would need enough GPU memory to keep model parameters, gradients and optimizer states. Where each of these, depending on the precision you are training, can take up multiple times of your parameter count x precision( depending on if its fp32/ 4 bytes, fp16/2 bytes/ bf16/2 bytes).
			
 
				 For example AdamW optimizer keeps 2 parameters for each of your parameters and in many cases these are kept in fp32. This implies that depending on how many layers you are training/ unfreezing your GPU memory can grow beyond one GPU.
			
 
				 
			
--- a/docs/multi_gpu.md
+++ b/docs/multi_gpu.md
--- a/docs/single_gpu.md
+++ b/docs/single_gpu.md
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
				 
			
 
				 [project]
			
 
				 name = "llama-recipes"
			
 
				-version = "0.0.1"
			
 
				+version = "0.0.2"
			
 
				 authors = [
			
 
				   { name="Hamid Shojanazeri", email="hamidnazeri@meta.com" },
			
 
				   { name="Matthias Reso", email="mreso@meta.com" },
			
--- a/recipes/README.md
+++ b/recipes/README.md
--- a/recipes/benchmarks/fmbench/README.md
+++ b/recipes/benchmarks/fmbench/README.md
--- a/recipes/benchmarks/fmbench/config.yml
+++ b/recipes/benchmarks/fmbench/config.yml
@@ -9,7 +9,7 @@ aws:
 
				   # SageMaker execution role used to run FMBench, this parameter is templatized, no need to change
			
 
				   sagemaker_execution_role: {role_arn}
			
 
				   # S3 bucket to which metrics, plots and reports would be written to
			
 
				-  bucket: {write_bucket} ## add the name of your desired bucket
			
 
				+  bucket: {write_bucket}
			
 
				 
			
 
				 # directory paths in the write bucket, no need to change these
			
 
				 dir_paths:
			
@@ -22,9 +22,10 @@ dir_paths:
 
				 
			
 
				 # S3 information for reading datasets, scripts and tokenizer
			
 
				 s3_read_data:
			
 
				-  # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-{region}-{account_id}
			
 
				+  # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-<region>-<account_id>
			
 
				   read_bucket: {read_bucket}
			
 
				-    
			
 
				+  scripts_prefix: scripts
			
 
				+  
			
 
				   # S3 prefix in the read bucket where deployment and inference scripts should be placed
			
 
				   scripts_prefix: scripts
			
 
				     
			
@@ -52,13 +53,12 @@ s3_read_data:
 
				   - narrativeqa.jsonl
			
 
				   - triviaqa_e.jsonl
			
 
				   - triviaqa.jsonl
			
 
				-
			
 
				   # S3 prefix for the tokenizer to be used with the models
			
 
				   # NOTE 1: the same tokenizer is used with all the models being tested through a config file
			
 
				   # NOTE 2: place your model specific tokenizers in a prefix named as <model_name>_tokenizer
			
 
				-  #         so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in  llama2_tokenizer
			
 
				+  #         so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in llama2_tokenizer and so on and so forth.
			
 
				   tokenizer_prefix: tokenizer
			
 
				-
			
 
				+  
			
 
				   # S3 prefix for prompt templates
			
 
				   prompt_template_dir: prompt_template
			
 
				 
			
@@ -79,7 +79,7 @@ run_steps:
 
				   4_model_metric_analysis.ipynb: yes
			
 
				   5_cleanup.ipynb: yes
			
 
				 
			
 
				-# dataset related configuration
			
 
				+
			
 
				 datasets:
			
 
				   # Refer to the 1_generate_data.ipynb notebook
			
 
				   # the dataset you use is expected to have the 
			
@@ -89,7 +89,7 @@ datasets:
 
				   prompt_template_keys:
			
 
				   - input
			
 
				   - context
			
 
				-
			
 
				+  
			
 
				   # if your dataset has multiple languages and it has a language
			
 
				   # field then you could filter it for a language. Similarly,
			
 
				   # you can filter your dataset to only keep prompts between
			
@@ -125,7 +125,7 @@ datasets:
 
				 # dataset which is listed below as the dataset_of_interest
			
 
				 metrics:
			
 
				   dataset_of_interest: en_2000-3000
			
 
				-  
			
 
				+
			
 
				 # all pricing information is in the pricing.yml file
			
 
				 # this file is provided in the repo. You can add entries
			
 
				 # to this file for new instance types and new Bedrock models
			
@@ -156,18 +156,18 @@ experiments:
 
				     # model_id is interpreted in conjunction with the deployment_script, so if you
			
 
				     # use a JumpStart model id then set the deployment_script to jumpstart.py.
			
 
				     # if deploying directly from HuggingFace this would be a HuggingFace model id
			
 
				-    # see the DJL serving deployment script in the code repo for reference.    
			
 
				+    # see the DJL serving deployment script in the code repo for reference.
			
 
				     model_id: meta-textgeneration-llama-2-7b-f
			
 
				     model_version: "3.*"
			
 
				     model_name: llama2-7b-f
			
 
				     ep_name: llama-2-7b-g5xlarge
			
 
				     instance_type: "ml.g5.xlarge"
			
 
				     image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
			
 
				-    deploy: yes    
			
 
				+    deploy: yes
			
 
				     instance_count: 1
			
 
				     # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
			
 
				     # scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own.
			
 
				-    # See repo for details
			
 
				+    # See repo for details    
			
 
				     deployment_script: jumpstart.py
			
 
				     # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
			
 
				     # and Bedrock. You can also add your own. See repo for details
			
@@ -181,6 +181,7 @@ experiments:
 
				     - payload_en_500-1000.jsonl
			
 
				     - payload_en_1000-2000.jsonl
			
 
				     - payload_en_2000-3000.jsonl
			
 
				+    #- payload_en_3000-3840.jsonl
			
 
				     # concurrency level refers to number of requests sent in parallel to an endpoint
			
 
				     # the next set of requests is sent once responses for all concurrent requests have
			
 
				     # been received.
			
@@ -188,7 +189,7 @@ experiments:
 
				     - 1
			
 
				     - 2
			
 
				     - 4
			
 
				-    # Added for models that require accepting a EULA
			
 
				+
			
 
				     accept_eula: true
			
 
				     # Environment variables to be passed to the container
			
 
				     # this is not a fixed list, you can add more parameters as applicable.
			
@@ -204,6 +205,10 @@ experiments:
 
				       SAGEMAKER_MODEL_SERVER_WORKERS: "1"
			
 
				 
			
 
				   - name: llama2-7b-g5.2xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
			
 
				+    # model_id is interpreted in conjunction with the deployment_script, so if you
			
 
				+    # use a JumpStart model id then set the deployment_script to jumpstart.py.
			
 
				+    # if deploying directly from HuggingFace this would be a HuggingFace model id
			
 
				+    # see the DJL serving deployment script in the code repo for reference. 
			
 
				     model_id: meta-textgeneration-llama-2-7b-f
			
 
				     model_version: "3.*"
			
 
				     model_name: llama2-7b-f
			
@@ -211,23 +216,36 @@ experiments:
 
				     instance_type: "ml.g5.2xlarge"
			
 
				     image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
			
 
				     deploy: yes
			
 
				+    # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart
			
 
				+    # scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own.
			
 
				+    # See repo for details
			
 
				     instance_count: 1
			
 
				     deployment_script: jumpstart.py
			
 
				+    # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
			
 
				+    # and Bedrock. You can also add your own. See repo for details
			
 
				     inference_script: sagemaker_predictor.py
			
 
				     inference_spec:
			
 
				+      # this should match one of the sections in the inference_parameters section above
			
 
				       parameter_set: sagemaker
			
 
				+    # runs are done for each combination of payload file and concurrency level
			
 
				     payload_files:
			
 
				     - payload_en_1-500.jsonl
			
 
				     - payload_en_500-1000.jsonl
			
 
				     - payload_en_1000-2000.jsonl
			
 
				     - payload_en_2000-3000.jsonl
			
 
				-
			
 
				+    #- payload_en_3000-3840.jsonl
			
 
				+    
			
 
				+    # concurrency level refers to number of requests sent in parallel to an endpoint
			
 
				+    # the next set of requests is sent once responses for all concurrent requests have
			
 
				+    # been received.
			
 
				     concurrency_levels:
			
 
				     - 1
			
 
				     - 2
			
 
				     - 4
			
 
				-
			
 
				+    # Added for models that require accepting a EULA
			
 
				     accept_eula: true
			
 
				+    # Environment variables to be passed to the container
			
 
				+    # this is not a fixed list, you can add more parameters as applicable.
			
 
				     env:
			
 
				       SAGEMAKER_PROGRAM: "inference.py"
			
 
				       ENDPOINT_SERVER_TIMEOUT: "3600"
			
@@ -249,7 +267,6 @@ report:
 
				   latency_budget: 2
			
 
				   cost_per_10k_txn_budget: 20
			
 
				   error_rate_budget: 0
			
 
				-
			
 
				   # other misc reporting parameters, see 4_model_metric_analysis.ipynb
			
 
				   # for more information
			
 
				   per_inference_request_file: per_inference_request_results.csv
			
--- a/recipes/benchmarks/fmbench/img/business_summary.png
+++ b/recipes/benchmarks/fmbench/img/business_summary.png
--- a/recipes/benchmarks/inference_throughput/on-prem/README.md
+++ b/recipes/benchmarks/inference_throughput/on-prem/README.md
@@ -1,26 +1,26 @@
 
				 # Llama-On-Prem-Benchmark
			
 
				-This folder contains code to run inference benchmark for Llama 2 models on-prem with popular serving frameworks.
			
 
				-The benchmark will focus on overall inference **throughput** for running containers on one instance (single or multiple GPUs) that you can acquire from cloud service providers such as Azure and AWS. You can also run this benchmark on local laptop or desktop.  
			
 
				+This folder contains code to run inference benchmark for Meta Llama 3 models on-prem with popular serving frameworks.
			
 
				+The benchmark will focus on overall inference **throughput** for running containers on one instance (single or multiple GPUs) that you can acquire from cloud service providers such as Azure and AWS. You can also run this benchmark on local laptop or desktop.
			
 
				 We support benchmark on these serving framework:
			
 
				 * [vLLM](https://github.com/vllm-project/vllm)
			
 
				 
			
 
				 
			
 
				 # vLLM - Getting Started
			
 
				 
			
 
				-To get started, we first need to deploy containers on-prem as a API host. Follow the guidance [here](../../../inference/model_servers/llama-on-prem.md#setting-up-vllm-with-llama-2) to deploy vLLM on-prem.
			
 
				+To get started, we first need to deploy containers on-prem as a API host. Follow the guidance [here](../../../inference/model_servers/llama-on-prem.md#setting-up-vllm-with-llama-3) to deploy vLLM on-prem.
			
 
				 
			
 
				-Note that in common scenario which overall throughput is important, we suggest you prioritize deploying as many model replicas as possible to reach higher overall throughput and request-per-second (RPS), comparing to deploy one model container among multiple GPUs for model parallelism. Additionally, as deploying multiple model replicas, there is a need for a higher level wrapper to handle the load balancing which here has been simulated in the benchmark scripts.  
			
 
				-For example, we have an instance from Azure that has 8xA100 80G GPUs, and we want to deploy the Llama 2 70B chat model, which is around 140GB with FP16. So for deployment we can do:
			
 
				+Note that in common scenario which overall throughput is important, we suggest you prioritize deploying as many model replicas as possible to reach higher overall throughput and request-per-second (RPS), comparing to deploy one model container among multiple GPUs for model parallelism. Additionally, as deploying multiple model replicas, there is a need for a higher level wrapper to handle the load balancing which here has been simulated in the benchmark scripts.
			
 
				+For example, we have an instance from Azure that has 8xA100 80G GPUs, and we want to deploy the Meta Llama 3 70B instruct model, which is around 140GB with FP16. So for deployment we can do:
			
 
				 * 1x70B model parallel on 8 GPUs, each GPU RAM takes around 17.5GB for loading model weights.
			
 
				 * 2x70B models each use 4 GPUs, each GPU RAM takes around 35GB for loading model weights.
			
 
				 * 4x70B models each use 2 GPUs, each GPU RAM takes around 70GB for loading model weights. (Preferred configuration for max overall throughput. Note that you will have 4 endpoints hosted on different ports and the benchmark script will route requests into each model equally)
			
 
				 
			
 
				 Here are examples for deploying 2x70B chat models over 8 GPUs with vLLM.
			
 
				 ```
			
 
				-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Llama-2-70b-chat-hf --tensor-parallel-size 4 --disable-log-requests --port 8000 
			
 
				-CUDA_VISIBLE_DEVICES=4,5,6,7 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Llama-2-70b-chat-hf --tensor-parallel-size 4 --disable-log-requests --port 8001 
			
 
				+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8000
			
 
				+CUDA_VISIBLE_DEVICES=4,5,6,7 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8001
			
 
				 ```
			
 
				-Once you have finished deployment, you can use the command below to run benchmark scripts in a separate terminal. 
			
 
				+Once you have finished deployment, you can use the command below to run benchmark scripts in a separate terminal.
			
 
				 
			
 
				 ```
			
 
				 python chat_vllm_benchmark.py
			
@@ -32,9 +32,8 @@ If you are going to use [Azure AI content check](https://azure.microsoft.com/en-
 
				 pip install azure-ai-contentsafety azure-core
			
 
				 ```
			
 
				 Besides chat models, we also provide benchmark scripts for running pretrained models for text completion tasks. To better simulate the real traffic, we generate configurable random token prompt as input. In this process, we select vocabulary that is longer than 2 tokens so the generated words are closer to the English, rather than symbols.
			
 
				-However, random token prompts can't be applied for chat model benchmarks, since the chat model expects a valid question. By feeding random prompts, chat models rarely provide answers that is meeting our ```MAX_NEW_TOKEN``` requirement, defeating the purpose of running throughput benchmarks. Hence for chat models, the questions are copied over to form long inputs such as for 2k and 4k inputs.   
			
 
				+However, random token prompts can't be applied for chat model benchmarks, since the chat model expects a valid question. By feeding random prompts, chat models rarely provide answers that is meeting our ```MAX_NEW_TOKEN``` requirement, defeating the purpose of running throughput benchmarks. Hence for chat models, the questions are copied over to form long inputs such as for 2k and 4k inputs.
			
 
				 To run pretrained model benchmark, follow the command below.
			
 
				 ```
			
 
				 python pretrained_vllm_benchmark.py
			
 
				 ```
			
 
				-
			
--- a/recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py
+++ b/recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py
@@ -40,8 +40,6 @@ MODEL_HEADERS = params["MODEL_HEADERS"]
 
				 SAFE_CHECK = params["SAFE_CHECK"]
			
 
				 # Threshold for tokens per second below which we deem the query to be slow
			
 
				 THRESHOLD_TPS = params["THRESHOLD_TPS"] 
			
 
				-# Default Llama tokenizer, replace with your own tokenizer 
			
 
				-TOKENIZER_PATH = params["TOKENIZER_PATH"] 
			
 
				 TEMPERATURE = params["TEMPERATURE"]
			
 
				 TOP_P = params["TOP_P"]
			
 
				 # Add your model endpoints here, specify the port number. You can acquire the endpoint when creating a on-prem server like vLLM.
			
@@ -55,8 +53,8 @@ else:
 
				     print("No available GPUs")
			
 
				 
			
 
				 
			
 
				-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
			
 
				-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
			
 
				+# This tokenizer is downloaded from HuggingFace based on the model path you set. Note Llama 3 use a different tokenizer compare to Llama 2
			
 
				+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
			
 
				 
			
 
				 num_token_input_prompt = len(tokenizer.encode(PROMPT))
			
 
				 print(f"Number of token for input prompt: {num_token_input_prompt}")
			
--- a/recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json
+++ b/recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json
@@ -1,15 +1,14 @@
 
				 {
			
 
				     "MAX_NEW_TOKENS" : 256,
			
 
				     "CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64, 128, 256],
			
 
				-    "MODEL_PATH" : "meta-llama/Llama-2-7b-chat-hf",
			
 
				+    "MODEL_PATH" : "meta-llama/Meta-Llama-3-70B-Instruct",
			
 
				     "MODEL_HEADERS" : {"Content-Type": "application/json"},
			
 
				     "SAFE_CHECK" : true,
			
 
				     "THRESHOLD_TPS" : 7,
			
 
				-    "TOKENIZER_PATH" : "../../tokenizer",
			
 
				     "RANDOM_PROMPT_LENGTH" : 1000,
			
 
				     "TEMPERATURE" : 0.6,
			
 
				     "TOP_P" : 0.9,
			
 
				     "MODEL_ENDPOINTS" : [
			
 
				         "http://localhost:8000/v1/chat/completions"
			
 
				     ]
			
 
				-}
			
 
				+}
			
--- a/recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py
+++ b/recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py
@@ -36,8 +36,6 @@ MODEL_HEADERS = params["MODEL_HEADERS"]
 
				 SAFE_CHECK = params["SAFE_CHECK"]
			
 
				 # Threshold for tokens per second below which we deem the query to be slow
			
 
				 THRESHOLD_TPS = params["THRESHOLD_TPS"] 
			
 
				-# Replace with your own tokenizer 
			
 
				-TOKENIZER_PATH = params["TOKENIZER_PATH"] 
			
 
				 RANDOM_PROMPT_LENGTH = params["RANDOM_PROMPT_LENGTH"]
			
 
				 TEMPERATURE = params["TEMPERATURE"]
			
 
				 TOP_P = params["TOP_P"]
			
@@ -52,8 +50,8 @@ else:
 
				     print("No available GPUs")
			
 
				 
			
 
				 
			
 
				-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
			
 
				-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
			
 
				+# This tokenizer is downloaded from HuggingFace based on the model path you set. Note Llama 3 use a different tokenizer compare to Llama 2
			
 
				+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
			
 
				 
			
 
				 # Select vocabulary that is longer than 2 tokens (closer to real words) and close to the English (not foolproof)
			
 
				 vocab = [token for token in tokenizer.get_vocab().keys() if len(token) > 2 and all(ord(c) < 128 for c in token)]
			
--- a/recipes/evaluation/README.md
+++ b/recipes/evaluation/README.md
@@ -28,7 +28,7 @@ Before running the evaluation script, ensure you have all the necessary dependen
 
				 Clone the lm-evaluation-harness repository and install it:
			
 
				 
			
 
				 ```bash
			
 
				-git clone https://github.com/matthoffner/lm-evaluation-harness.git
			
 
				+git clone https://github.com/EleutherAI/lm-evaluation-harness.git
			
 
				 cd lm-evaluation-harness
			
 
				 pip install -e .
			
 
				 
			
--- a/recipes/evaluation/eval.py
+++ b/recipes/evaluation/eval.py
@@ -11,7 +11,7 @@ from pathlib import Path
 
				 
			
 
				 import numpy as np
			
 
				 import lm_eval
			
 
				-from lm_eval import evaluator, tasks
			
 
				+from lm_eval import tasks
			
 
				 from lm_eval.utils import make_table
			
 
				 
			
 
				 
			
@@ -73,12 +73,11 @@ def handle_output(args, results, logger):
 
				 
			
 
				 
			
 
				 def load_tasks(args):
			
 
				-    tasks.initialize_tasks()
			
 
				     if args.open_llm_leaderboard_tasks:
			
 
				         current_dir = os.getcwd()
			
 
				         config_dir = os.path.join(current_dir, "open_llm_leaderboard")
			
 
				-        lm_eval.tasks.include_path(config_dir)
			
 
				-        return [
			
 
				+        task_manager = tasks.TaskManager(include_path=config_dir)
			
 
				+        return task_manager, [
			
 
				             "arc_challenge_25_shot",
			
 
				             "hellaswag_10_shot",
			
 
				             "truthfulqa_mc2",
			
@@ -86,7 +85,7 @@ def load_tasks(args):
 
				             "gsm8k",
			
 
				             "mmlu",
			
 
				         ]
			
 
				-    return args.tasks.split(",") if args.tasks else []
			
 
				+    return None, args.tasks.split(",") if args.tasks else []
			
 
				 
			
 
				 
			
 
				 def parse_eval_args():
			
@@ -190,21 +189,18 @@ def parse_eval_args():
 
				         default=None,
			
 
				         help="Additional path to include if there are external tasks.",
			
 
				     )
			
 
				-    parser.add_argument(
			
 
				-        "--decontamination_ngrams_path", default=None
			
 
				-    )  # Not currently used
			
 
				     return parser.parse_args()
			
 
				 
			
 
				 
			
 
				 def evaluate_model(args):
			
 
				     try:
			
 
				-        task_list = load_tasks(args)
			
 
				+        task_manager, task_list = load_tasks(args)
			
 
				         # Customized model such as Quantized model etc.
			
 
				         # In case you are working with a custom model, you can use the following guide to add it here:
			
 
				         # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage
			
 
				 
			
 
				         # Evaluate
			
 
				-        results = evaluator.simple_evaluate(
			
 
				+        results = lm_eval.simple_evaluate(
			
 
				             model=args.model,
			
 
				             model_args=args.model_args,
			
 
				             tasks=task_list,
			
@@ -214,11 +210,11 @@ def evaluate_model(args):
 
				             device=args.device,
			
 
				             use_cache=args.use_cache,
			
 
				             limit=args.limit,
			
 
				-            decontamination_ngrams_path=args.decontamination_ngrams_path,
			
 
				             check_integrity=args.check_integrity,
			
 
				             write_out=args.write_out,
			
 
				             log_samples=args.log_samples,
			
 
				             gen_kwargs=args.gen_kwargs,
			
 
				+            task_manager=task_manager,
			
 
				         )
			
 
				         handle_output(args, results, logger)
			
 
				 
			
--- a/recipes/finetuning/LLM_finetuning_overview.md
+++ b/recipes/finetuning/LLM_finetuning_overview.md
@@ -1,6 +1,6 @@
 
				 ## LLM Fine-Tuning
			
 
				 
			
 
				-Here we discuss fine-tuning Llama 2 with a couple of different recipes. We will cover two scenarios here:
			
 
				+Here we discuss fine-tuning Meta Llama 3 with a couple of different recipes. We will cover two scenarios here:
			
 
				 
			
 
				 
			
 
				 ## 1. **Parameter Efficient Model Fine-Tuning**
			
@@ -42,7 +42,7 @@ You can also keep most of the layers frozen and only fine-tune a few layers. The
 
				 
			
 
				 
			
 
				 
			
 
				-In this scenario depending on the model size, you might need to go beyond one GPU, especially if your model does not fit into one GPU for training. In this case Llama 2 7B parameter won't fit into one gpu.
			
 
				+In this scenario depending on the model size, you might need to go beyond one GPU, especially if your model does not fit into one GPU for training. In this case Meta Llama 3 8B parameter won't fit into one gpu.
			
 
				 The way you want to think about it is, you would need enough GPU memory to keep model parameters, gradients and optimizer states. Where each of these, depending on the precision you are training, can take up multiple times of your parameter count x precision( depending on if its fp32/ 4 bytes, fp16/2 bytes/ bf16/2 bytes).
			
 
				 For example AdamW optimizer keeps 2 parameters for each of your parameters and in many cases these are kept in fp32. This implies that depending on how many layers you are training/ unfreezing your GPU memory can grow beyond one GPU.
			
 
				 
			
--- a/recipes/finetuning/README.md
+++ b/recipes/finetuning/README.md
--- a/recipes/finetuning/multigpu_finetuning.md
+++ b/recipes/finetuning/multigpu_finetuning.md
--- a/recipes/finetuning/singlegpu_finetuning.md
+++ b/recipes/finetuning/singlegpu_finetuning.md
--- a/recipes/inference/llama_web_ui/Llama2_Gradio.ipynb
+++ b/recipes/inference/llama_web_ui/Llama2_Gradio.ipynb
@@ -1,130 +0,0 @@
 
				-{
			
 
				- "cells": [
			
 
				-  {
			
 
				-   "cell_type": "code",
			
 
				-   "execution_count": null,
			
 
				-   "id": "e4532411",
			
 
				-   "metadata": {},
			
 
				-   "outputs": [],
			
 
				-   "source": [
			
 
				-    "# TODO REFACTOR: Integrate code from _legacy/inference.py into this notebook"
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "markdown",
			
 
				-   "id": "47a9adb3",
			
 
				-   "metadata": {},
			
 
				-   "source": [
			
 
				-    "## This demo app shows how to query Llama 2 using the Gradio UI.\n",
			
 
				-    "\n",
			
 
				-    "Since we are using Replicate in this example, you will need to replace `<your replicate api token>` with your API token.\n",
			
 
				-    "\n",
			
 
				-    "To get the Replicate token: \n",
			
 
				-    "\n",
			
 
				-    "- You will need to first sign in with Replicate with your github account\n",
			
 
				-    "- Then create a free API token [here](https://replicate.com/account/api-tokens) that you can use for a while \n",
			
 
				-    "\n",
			
 
				-    "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on Replicate.\n",
			
 
				-    "\n",
			
 
				-    "To run this example:\n",
			
 
				-    "- Set up your Replicate API token and enter it in place of `<your replicate api token>`\n",
			
 
				-    "- Run the notebook\n",
			
 
				-    "- Enter your question and click Submit\n",
			
 
				-    "\n",
			
 
				-    "In the notebook or a browser with URL http://127.0.0.1:7860 you should see a UI with your answer."
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "code",
			
 
				-   "execution_count": 1,
			
 
				-   "id": "928041cc",
			
 
				-   "metadata": {},
			
 
				-   "outputs": [
			
 
				-    {
			
 
				-     "name": "stderr",
			
 
				-     "output_type": "stream",
			
 
				-     "text": [
			
 
				-      "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
			
 
				-     ]
			
 
				-    },
			
 
				-    {
			
 
				-     "name": "stdout",
			
 
				-     "output_type": "stream",
			
 
				-     "text": [
			
 
				-      "Running on local URL:  http://127.0.0.1:7860\n",
			
 
				-      "\n",
			
 
				-      "To create a public link, set `share=True` in `launch()`.\n"
			
 
				-     ]
			
 
				-    },
			
 
				-    {
			
 
				-     "data": {
			
 
				-      "text/html": [
			
 
				-       "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
			
 
				-      ],
			
 
				-      "text/plain": [
			
 
				-       "<IPython.core.display.HTML object>"
			
 
				-      ]
			
 
				-     },
			
 
				-     "metadata": {},
			
 
				-     "output_type": "display_data"
			
 
				-    },
			
 
				-    {
			
 
				-     "data": {
			
 
				-      "text/plain": []
			
 
				-     },
			
 
				-     "execution_count": 1,
			
 
				-     "metadata": {},
			
 
				-     "output_type": "execute_result"
			
 
				-    }
			
 
				-   ],
			
 
				-   "source": [
			
 
				-    "from langchain.schema import AIMessage, HumanMessage\n",
			
 
				-    "import gradio as gr\n",
			
 
				-    "from langchain.llms import Replicate\n",
			
 
				-    "import os\n",
			
 
				-    "\n",
			
 
				-    "os.environ[\"REPLICATE_API_TOKEN\"] = \"<your replicate api token>\"\n",
			
 
				-    "\n",
			
 
				-    "llama2_13b_chat = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
			
 
				-    "\n",
			
 
				-    "llm = Replicate(\n",
			
 
				-    "    model=llama2_13b_chat,\n",
			
 
				-    "    model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n",
			
 
				-    ")\n",
			
 
				-    "\n",
			
 
				-    "\n",
			
 
				-    "def predict(message, history):\n",
			
 
				-    "    history_langchain_format = []\n",
			
 
				-    "    for human, ai in history:\n",
			
 
				-    "        history_langchain_format.append(HumanMessage(content=human))\n",
			
 
				-    "        history_langchain_format.append(AIMessage(content=ai))\n",
			
 
				-    "    history_langchain_format.append(HumanMessage(content=message))\n",
			
 
				-    "    gpt_response = llm(message) #history_langchain_format)\n",
			
 
				-    "    return gpt_response#.content\n",
			
 
				-    "\n",
			
 
				-    "gr.ChatInterface(predict).launch()"
			
 
				-   ]
			
 
				-  }
			
 
				- ],
			
 
				- "metadata": {
			
 
				-  "kernelspec": {
			
 
				-   "display_name": "Python 3 (ipykernel)",
			
 
				-   "language": "python",
			
 
				-   "name": "python3"
			
 
				-  },
			
 
				-  "language_info": {
			
 
				-   "codemirror_mode": {
			
 
				-    "name": "ipython",
			
 
				-    "version": 3
			
 
				-   },
			
 
				-   "file_extension": ".py",
			
 
				-   "mimetype": "text/x-python",
			
 
				-   "name": "python",
			
 
				-   "nbconvert_exporter": "python",
			
 
				-   "pygments_lexer": "ipython3",
			
 
				-   "version": "3.8.18"
			
 
				-  }
			
 
				- },
			
 
				- "nbformat": 4,
			
 
				- "nbformat_minor": 5
			
 
				-}
			
--- a/recipes/inference/llama_web_ui/README.md
+++ b/recipes/inference/llama_web_ui/README.md
@@ -1,25 +0,0 @@
 
				-## Quick Web UI for Llama2 Chat
			
 
				-If you prefer to see Llama2 in action in a web UI, instead of the notebooks above, you can try one of the two methods:
			
 
				-
			
 
				-### Running [Streamlit](https://streamlit.io/) with Llama2
			
 
				-Open a Terminal, run the following commands:
			
 
				-```
			
 
				-pip install streamlit langchain replicate
			
 
				-git clone https://github.com/facebookresearch/llama-recipes
			
 
				-cd llama-recipes/llama-demo-apps
			
 
				-```
			
 
				-
			
 
				-Replace the `<your replicate api token>` in `streamlit_llama2.py` with your API token created [here](https://replicate.com/account/api-tokens) - for more info, see the note [above](#replicate_note).
			
 
				-
			
 
				-Then run the command `streamlit run streamlit_llama2.py` and you'll see on your browser the following UI with question and answer - you can enter new text question, click Submit, and see Llama2's answer:
			
 
				-
			
 
				-![](../../../docs/images/llama2-streamlit.png)
			
 
				-![](../../../docs/images/llama2-streamlit2.png)
			
 
				-
			
 
				-### Running [Gradio](https://www.gradio.app/) with Llama2 (using [Replicate](Llama2_Gradio.ipynb) or [OctoAI](../../llama_api_providers/OctoAI_API_examples/Llama2_Gradio.ipynb))
			
 
				-
			
 
				-To see how to query Llama2 and get answers with the Gradio UI both from the notebook and web, just launch the notebook `Llama2_Gradio.ipynb`. For more info, on how to get set up with a token to power these apps, see the note on [Replicate](../../README.md#replicate_note) and [OctoAI](../../README.md##octoai_note).
			
 
				-
			
 
				-Then enter your question, click Submit. You'll see in the notebook or a browser with URL http://127.0.0.1:7860 the following UI:
			
 
				-
			
 
				-![](../../../docs/images/llama2-gradio.png)
			
--- a/recipes/inference/llama_web_ui/requirements.txt
+++ b/recipes/inference/llama_web_ui/requirements.txt
@@ -1,3 +0,0 @@
 
				-streamlit
			
 
				-langchain
			
 
				-replicate
			
--- a/recipes/inference/llama_web_ui/streamlit_llama2.py
+++ b/recipes/inference/llama_web_ui/streamlit_llama2.py
@@ -1,27 +0,0 @@
 
				-# Copyright (c) Meta Platforms, Inc. and affiliates.
			
 
				-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
			
 
				-
			
 
				-# TODO REFACTOR: Convert this to an ipynb notebook
			
 
				-
			
 
				-import streamlit as st
			
 
				-from langchain.llms import Replicate
			
 
				-import os
			
 
				-
			
 
				-st.title("Llama2-powered Streamlit App")
			
 
				-
			
 
				-with st.sidebar:
			
 
				-    os.environ["REPLICATE_API_TOKEN"] = "<your replicate api token>"
			
 
				-
			
 
				-def generate_response(input_text):
			
 
				-    llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"
			
 
				-
			
 
				-    llm = Replicate(
			
 
				-        model=llama2_13b_chat,
			
 
				-        model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens":500}
			
 
				-    )
			
 
				-    st.info(llm(input_text))
			
 
				-
			
 
				-with st.form("my_form"):
			
 
				-    text = st.text_area("Enter text:", "What is Generative AI?")
			
 
				-    submitted = st.form_submit_button("Submit")
			
 
				-    generate_response(text)
			
--- a/recipes/inference/local_inference/README.md
+++ b/recipes/inference/local_inference/README.md
@@ -69,7 +69,7 @@ In case you have fine-tuned your model with pure FSDP and saved the checkpoints
 
				 This is helpful if you have fine-tuned you model using FSDP only as follows:
			
 
				 
			
 
				 ```bash
			
 
				-torchrun --nnodes 1 --nproc_per_node 8  recipes/finetuning/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16
			
 
				+torchrun --nnodes 1 --nproc_per_node 8  recipes/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16
			
 
				 ```
			
 
				 Then convert your FSDP checkpoint to HuggingFace checkpoints using:
			
 
				 ```bash
			
--- a/recipes/inference/mobile_inference/android_inference/README.md
+++ b/recipes/inference/mobile_inference/android_inference/README.md
@@ -0,0 +1,147 @@
 
				+# Running Llama3 8B Instruct on Android with MLC-LLM
			
 
				+
			
 
				+Author: Thierry Moreau - tmoreau@octo.ai
			
 
				+
			
 
				+# Overview
			
 
				+In this tutorial we'll learn how to deploy Llama3 8B Instruct on an Android-based phone using MLC-LLM.
			
 
				+
			
 
				+Machine Learning Compilation for Large Language Models (MLC LLM) is a high-performance universal deployment solution that allows native deployment of any large language models with native APIs with compiler acceleration. The mission of this project is to enable everyone to develop, optimize and deploy AI models natively on everyone's devices with ML compilation techniques.
			
 
				+
			
 
				+You can read more about MLC-LLM at the following [link](https://github.com/mlc-ai/mlc-llm).
			
 
				+
			
 
				+MLC-LLM is also what powers the Llama3 inference APIs provided by [OctoAI](https://octo.ai/). You can use OctoAI for your Llama3 cloud-based inference needs by trying out the examples under the [following path](../../../llama_api_providers/OctoAI_API_examples/).
			
 
				+
			
 
				+This tutorial was tested with the following setup:
			
 
				+* MacBook Pro 16 inch from 2021 with Apple M1 Max and 32GB of RAM running Sonoma 14.3.1
			
 
				+* OnePlus 12 Android Smartphone with a Snapdragon 8Gen3 SoC and 12GB or RAM, running OxygenOS 14.0
			
 
				+
			
 
				+Running Llama3 on a phone will likely require a powerful chipset. We haven't tested extensively the range of chipset that will support this usecase. Feel free to update this README.md to specify what devices were successfully tested.
			
 
				+
			
 
				+| Phone      | Chipset          | RAM  | Status  | Comments |
			
 
				+|------------|------------------|------|---------|----------|
			
 
				+| OnePlus 12 | Snapdragon 8Gen3 | 12GB | Success | None     |
			
 
				+|            |                  |      |         |          |
			
 
				+
			
 
				+This guide is heavily based on the [MLC Android Guide](https://llm.mlc.ai/docs/deploy/android.html), but several steps have been taken to streamline the instructions.
			
 
				+
			
 
				+# Pre-requisites
			
 
				+
			
 
				+## Python
			
 
				+
			
 
				+Whether you're using conda or virtual env to manage your environment, we highly recommend starting from scratch with a clean new environment.
			
 
				+
			
 
				+For instance with virtual environment:
			
 
				+```bash
			
 
				+python3 -m venv .venv
			
 
				+source .venv/bin/activate
			
 
				+```
			
 
				+
			
 
				+Next you'll need to install the following packages:
			
 
				+```bash
			
 
				+python3 -m pip install -r requirements.txt
			
 
				+```
			
 
				+
			
 
				+## Rust
			
 
				+
			
 
				+[Rust](https://www.rust-lang.org/tools/install) is needed to cross-compile HuggingFace tokenizers to Android.
			
 
				+Make sure rustc, cargo, and rustup are available in $PATH.
			
 
				+
			
 
				+
			
 
				+## Android Studio
			
 
				+
			
 
				+Install Android Studio from <!-- markdown-link-check-disable -->https://developer.android.com/studio<!-- markdown-link-check-enable --> with NDK and CMake.
			
 
				+
			
 
				+To install NDK and CMake, in the Android Studio welcome page, click “Projects → SDK Manager → SDK Tools”. Set up the following environment variables:
			
 
				+
			
 
				+* ANDROID_NDK so that $ANDROID_NDK/build/cmake/android.toolchain.cmake is available.
			
 
				+* TVM_NDK_CC that points to NDK's clang compiler.
			
 
				+
			
 
				+For instance, the paths will look like the following on OSX for user `moreau`:
			
 
				+```bash
			
 
				+# Android + TVM setup
			
 
				+export ANDROID_NDK="/Users/moreau/Library/Android/sdk/ndk/26.1.10909125"
			
 
				+export TVM_NDK_CC="$ANDROID_NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android24-clang"
			
 
				+```
			
 
				+
			
 
				+This tutorial was tested successfully on Android Studio Hedgehog | 2023.1.1 Patch 1.
			
 
				+
			
 
				+## JDK
			
 
				+
			
 
				+JDK, such as OpenJDK >= 17, to compile Java bindings of TVM Unity runtime.
			
 
				+
			
 
				+We strongly recommend setting the JAVA_HOME to the JDK bundled with Android Studio. Using Android Studio’s JBR bundle as recommended (<!-- markdown-link-check-disable -->https://developer.android.com/build/jdks<!-- markdown-link-check-enable -->) will reduce the chances of potential errors in JNI compilation.
			
 
				+
			
 
				+For instance on macOS, you'll need to point JAVA_HOME to the following.
			
 
				+
			
 
				+```bash
			
 
				+export JAVA_HOME=/Applications/Android\ Studio.app/Contents/jbr/Contents/Home
			
 
				+```
			
 
				+
			
 
				+To make sure the java binary can be found do an `ls $JAVA_HOME/bin/java`
			
 
				+
			
 
				+## MLC-LLM
			
 
				+
			
 
				+Let's clone mlc-llm from its repo in the directory of your choice:
			
 
				+
			
 
				+```bash
			
 
				+cd /path/to/where/to/clone/repo
			
 
				+git clone https://github.com/mlc-ai/mlc-llm --recursive
			
 
				+export MLC_LLM_HOME=/path/to/mlc-llm
			
 
				+```
			
 
				+
			
 
				+At the time of writing this README, we tested `mlc-llm` at the following sha: `21feb7010db02e0c2149489f5972d6a8a796b5a0`.
			
 
				+
			
 
				+## Phone Setup
			
 
				+
			
 
				+On your phone, enable debugging on your phone in your phone’s developer settings. Each phone manufacturer will have its own approach to enabling debug mode, so a simple Google search should equip you with the steps to do that on your phone.
			
 
				+
			
 
				+In addition, make sure to change your USB configuration from "Charging" to "MTP (Media Transfer Protocol)". This will allow us to connect to the device serially.
			
 
				+
			
 
				+Connect your phone to your development machine. On OSX, you'll be prompted on the dev machine whether you want to allow the accessory to connect. Hit "Allow".
			
 
				+
			
 
				+# Build Steps
			
 
				+
			
 
				+## Building the Android Package with MLC
			
 
				+
			
 
				+First edit the file under `android/MLCChat/mlc-package-config.json` and with the [mlc-package-config.json](./mlc-package-config.json) in llama-recipes.
			
 
				+
			
 
				+To understand what these JSON fields mean you can refer to this [documentation](https://llm.mlc.ai/docs/deploy/android.html#step-2-build-runtime-and-model-libraries).
			
 
				+
			
 
				+
			
 
				+From the `mlc-llm` project root directory:
			
 
				+
			
 
				+```bash
			
 
				+cd $MLC_LLM_HOME
			
 
				+cd android/MLCChat
			
 
				+python3 -m mlc_llm package  --package-config mlc-package-config.json --output dist
			
 
				+```
			
 
				+
			
 
				+The command above will take a few minutes to run as it runs through the following steps:
			
 
				+
			
 
				+* Compile the Llama 3 8B instruct specified in the `mlc-package-config.json` into a binary model library.
			
 
				+* Build the `mlc-llm` runtime and tokenizer. In addition to the model itself, a lightweight runtime and tokenizer are required to actually run the LLM.
			
 
				+
			
 
				+## Building and Running MLC Chat in Android Studio
			
 
				+
			
 
				+Now let's launch Android Studio.
			
 
				+
			
 
				+* On the "Welcome to Android Studio" page, hit "Open", and navigate to `$MLC_LLM_HOME/android/MLCChat`, then hit "Open"
			
 
				+* A window will pop up asking whether to "Trust and Open project 'MLCChat'" - hit "Trust Project"
			
 
				+* The project will now launch
			
 
				+* Under File -> Project Structure... -> Project change the Gradle Version (second drop down from the top) to 8.5
			
 
				+
			
 
				+Connect your phone to your development machine - assuming you've followed the setup steps in the pre-requisite section, you should be able to see the device.
			
 
				+
			
 
				+Next you'll need to:
			
 
				+
			
 
				+* Hit Build -> Make Project.
			
 
				+* Hit Run -> Run 'app'
			
 
				+
			
 
				+The MLCChat app will launch on your phone, now access your phone:
			
 
				+
			
 
				+* Under Model List you'll see the `Llama-3-8B-Instruct` LLM listed.
			
 
				+* The model's not quite ready to launch yet, because the weights need to be downloaded over Wifi first. Hit the Download button on the right to the model name to download the weights from HuggingFace.
			
 
				+
			
 
				+Note that you can change the build settings to bundle the weights with the MLCChat app so you don't have to download the weights over wifi. To do so you can follow the instructions [here](https://llm.mlc.ai/docs/deploy/android.html#bundle-model-weights).
			
 
				+
			
 
				+Once the model weights are downloaded you can now interact with Llama 3 locally on your Android phone!
			
--- a/recipes/inference/mobile_inference/android_inference/mlc-package-config.json
+++ b/recipes/inference/mobile_inference/android_inference/mlc-package-config.json
@@ -0,0 +1,14 @@
 
				+{
			
 
				+    "device": "android",
			
 
				+    "model_list": [
			
 
				+        {
			
 
				+            "model": "HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC",
			
 
				+            "estimated_vram_bytes": 4348727787,
			
 
				+            "model_id": "Llama-3-8B-Instruct",
			
 
				+            "overrides": {
			
 
				+                "context_window_size": 768,
			
 
				+                "prefill_chunk_size": 256
			
 
				+            }
			
 
				+        }
			
 
				+    ]
			
 
				+}
			
--- a/recipes/inference/mobile_inference/android_inference/requirements.txt
+++ b/recipes/inference/mobile_inference/android_inference/requirements.txt
@@ -0,0 +1,14 @@
 
				+--pre
			
 
				+--find-links https://mlc.ai/wheels
			
 
				+mlc-llm-nightly
			
 
				+mlc-ai-nightly
			
 
				+attrs
			
 
				+decorator
			
 
				+numpy
			
 
				+psutil
			
 
				+pydantic
			
 
				+requests
			
 
				+scipy
			
 
				+setuptools
			
 
				+torch
			
 
				+tqdm
			
--- a/recipes/inference/model_servers/README.md
+++ b/recipes/inference/model_servers/README.md
@@ -1,4 +1,2 @@
 
				-## [Running Llama2 On-Prem with vLLM and TGI](llama-on-prem.md)
			
 
				-This tutorial shows how to use Llama 2 with [vLLM](https://github.com/vllm-project/vllm) and Hugging Face [TGI](https://github.com/huggingface/text-generation-inference) to build Llama 2 on-prem apps.
			
 
				-
			
 
				-\* To run a quantized Llama2 model on iOS and Android, you can use  the open source [MLC LLM](https://github.com/mlc-ai/mlc-llm) or [llama.cpp](https://github.com/ggerganov/llama.cpp). You can even make a Linux OS that boots to Llama2 ([repo](https://github.com/trholding/llama2.c)).
			
 
				+## [Running Llama 3 On-Prem with vLLM and TGI](llama-on-prem.md)
			
 
				+This tutorial shows how to use Llama 3 with [vLLM](https://github.com/vllm-project/vllm) and Hugging Face [TGI](https://github.com/huggingface/text-generation-inference) to build Llama 3 on-prem apps.
			
--- a/recipes/inference/model_servers/llama-on-prem.md
+++ b/recipes/inference/model_servers/llama-on-prem.md
--- a/recipes/llama_api_providers/Azure_API_example/azure_api_example.ipynb
+++ b/recipes/llama_api_providers/Azure_API_example/azure_api_example.ipynb
@@ -4,13 +4,14 @@
 
				    "cell_type": "markdown",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "# Use Azure API with Llama 2\n",
			
 
				+    "# Use Azure API with Llama 3\n",
			
 
				     "\n",
			
 
				-    "This notebook shows examples of how to use Llama 2 APIs offered by Microsoft Azure. We will cover:  \n",
			
 
				-    "* HTTP requests API usage for Llama 2 pretrained and chat models in CLI\n",
			
 
				-    "* HTTP requests API usage for Llama 2 pretrained and chat models in Python\n",
			
 
				+    "This notebook shows examples of how to use Llama 3 APIs offered by Microsoft Azure. We will cover:  \n",
			
 
				+    "* HTTP requests API usage for Llama 3 instruct models in CLI\n",
			
 
				+    "* HTTP requests API usage for Llama 3 instruct models in Python\n",
			
 
				     "* Plug the APIs into LangChain\n",
			
 
				     "* Wire the model with Gradio to build a simple chatbot with memory\n",
			
 
				+    "\n",
			
 
				     "\n"
			
 
				    ]
			
 
				   },
			
@@ -20,15 +21,13 @@
 
				    "source": [
			
 
				     "## Prerequisite\n",
			
 
				     "\n",
			
 
				-    "Before we start building with Azure Llama 2 APIs, there are certain steps we need to take to deploy the models:\n",
			
 
				+    "Before we start building with Azure Llama 3 APIs, there are certain steps we need to take to deploy the models:\n",
			
 
				     "\n",
			
 
				     "* Register for a valid Azure account with subscription [here](https://azure.microsoft.com/en-us/free/search/?ef_id=_k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&OCID=AIDcmm5edswduu_SEM__k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&gad_source=1&gclid=CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE)\n",
			
 
				     "* Take a quick look on what is the [Azure AI Studio](https://learn.microsoft.com/en-us/azure/ai-studio/what-is-ai-studio?tabs=home) and navigate to the website from the link in the article\n",
			
 
				     "* Follow the demos in the article to create a project and [resource](https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/manage-resource-groups-portal) group, or you can also follow the guide [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio)\n",
			
 
				-    "* Select Llama models from Model catalog\n",
			
 
				-    "* Deploy with \"Pay-as-you-go\"\n",
			
 
				-    "\n",
			
 
				-    "Once deployed successfully, you should be assigned for an API endpoint and a security key for inference.  \n",
			
 
				+    "* For Llama 3 instruct models from Model catalog, click Deploy in the model page and select \"Pay-as-you-go\". Once deployed successfully, you should be assigned for an API endpoint and a security key for inference.\n",
			
 
				+    "* For Llama 3 pretrained models, Azure currently only support manual deployment under regular subscription. We are working with them to bring \"Pay-as-you-go\" for pretrained models.\n",
			
 
				     "\n",
			
 
				     "For more information, you should consult Azure's official documentation [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio) for model deployment and inference."
			
 
				    ]
			
@@ -41,10 +40,12 @@
 
				     "\n",
			
 
				     "### Basics\n",
			
 
				     "\n",
			
 
				+    "The usage and schema of the API are identical to Llama 3 API hosted on Azure.\n",
			
 
				+    "\n",
			
 
				     "For using the REST API, You will need to have an Endpoint url and Authentication Key associated with that endpoint.  \n",
			
 
				     "This can be acquired from previous steps.  \n",
			
 
				     "\n",
			
 
				-    "In this text completion example for pre-trained model, we use a simple curl call for illustration. There are three major components:  \n",
			
 
				+    "In this chat completion example for instruct model, we use a simple curl call for illustration. There are three major components:  \n",
			
 
				     "\n",
			
 
				     "* The `host-url` is your endpoint url with completion schema. \n",
			
 
				     "* The `headers` defines the content type as well as your api key. \n",
			
@@ -52,20 +53,9 @@
 
				    ]
			
 
				   },
			
 
				   {
			
 
				-   "cell_type": "code",
			
 
				-   "execution_count": null,
			
 
				-   "metadata": {},
			
 
				-   "outputs": [],
			
 
				-   "source": [
			
 
				-    "!curl -X POST -L https://your-endpoint.inference.ai.azure.com/v1/completions -H 'Content-Type: application/json' -H 'Authorization: your-auth-key' -d '{\"prompt\": \"Math is a\", \"max_tokens\": 30, \"temperature\": 0.7}' "
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				    "cell_type": "markdown",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "For chat completion, the API schema and request payload are slightly different.\n",
			
 
				-    "\n",
			
 
				     "The `host-url` needs to be `/v1/chat/completions` and the request payload to include roles in conversations. Here is a sample payload:  \n",
			
 
				     "\n",
			
 
				     "```\n",
			
@@ -100,18 +90,6 @@
 
				    "cell_type": "markdown",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "If you compare the generation result for both text and chat completion API calls, you will notice that:  \n",
			
 
				-    "\n",
			
 
				-    "* Text completion returns a list of `choices` for the input prompt, each contains generated text and completion information such as `logprobs`.\n",
			
 
				-    "* Chat completion returns a list of `choices` each with a `message` object with completion result, matching the `messages` object in the request.  \n",
			
 
				-    "\n",
			
 
				-    "\n"
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "markdown",
			
 
				-   "metadata": {},
			
 
				-   "source": [
			
 
				     "### Streaming\n",
			
 
				     "\n",
			
 
				     "One fantastic feature the API offers is the streaming capability.  \n",
			
@@ -147,7 +125,7 @@
 
				    "source": [
			
 
				     "### Content Safety Filtering\n",
			
 
				     "\n",
			
 
				-    "All Azure Llama 2 API endpoints have content safety feature turned on. Both input prompt and output tokens are filtered by this service automatically.  \n",
			
 
				+    "All Azure Llama 3 API endpoints have content safety feature turned on. Both input prompt and output tokens are filtered by this service automatically.  \n",
			
 
				     "To know more about the impact to the request/response payload, please refer to official guide [here](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/content-filter?tabs=python).   \n",
			
 
				     "\n",
			
 
				     "For model input and output, if the filter detects there is harmful content, the generation will error out with a response payload containing the reasoning, along with information on the type of content violation and its severity. \n",
			
@@ -172,7 +150,7 @@
 
				     "\n",
			
 
				     "Besides calling the API directly from command line tools, you can also programatically call them in Python.  \n",
			
 
				     "\n",
			
 
				-    "Here is an example for the text completion model:\n",
			
 
				+    "Here is an example for the instruct model:\n",
			
 
				     "\n",
			
 
				     "\n"
			
 
				    ]
			
@@ -187,53 +165,6 @@
 
				     "import json\n",
			
 
				     "\n",
			
 
				     "#Configure payload data sending to API endpoint\n",
			
 
				-    "data = {\"prompt\": \"Math is a\", \n",
			
 
				-    "         \"max_tokens\": 30, \n",
			
 
				-    "         \"temperature\": 0.7,\n",
			
 
				-    "         \"top_p\": 0.9,      \n",
			
 
				-    "}\n",
			
 
				-    "\n",
			
 
				-    "body = str.encode(json.dumps(data))\n",
			
 
				-    "\n",
			
 
				-    "#Replace the url with your API endpoint\n",
			
 
				-    "url = 'https://your-endpoint.inference.ai.azure.com/v1/completions'\n",
			
 
				-    "\n",
			
 
				-    "#Replace this with the key for the endpoint\n",
			
 
				-    "api_key = 'your-auth-key'\n",
			
 
				-    "if not api_key:\n",
			
 
				-    "    raise Exception(\"API Key is missing\")\n",
			
 
				-    "\n",
			
 
				-    "headers = {'Content-Type':'application/json', 'Authorization':(api_key)}\n",
			
 
				-    "req = urllib.request.Request(url, body, headers)\n",
			
 
				-    "\n",
			
 
				-    "try:\n",
			
 
				-    "    response = urllib.request.urlopen(req)\n",
			
 
				-    "    result = response.read()\n",
			
 
				-    "    print(result)\n",
			
 
				-    "except urllib.error.HTTPError as error:\n",
			
 
				-    "    print(\"The request failed with status code: \" + str(error.code))\n",
			
 
				-    "    # Print the headers - they include the requert ID and the timestamp, which are useful for debugging the failure\n",
			
 
				-    "    print(error.info())\n",
			
 
				-    "    print(error.read().decode(\"utf8\", 'ignore'))\n"
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "markdown",
			
 
				-   "metadata": {},
			
 
				-   "source": [
			
 
				-    "Chat completion in Python is very similar, here is a quick example:"
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "code",
			
 
				-   "execution_count": null,
			
 
				-   "metadata": {},
			
 
				-   "outputs": [],
			
 
				-   "source": [
			
 
				-    "import urllib.request\n",
			
 
				-    "import json\n",
			
 
				-    "\n",
			
 
				-    "#Configure payload data sending to API endpoint\n",
			
 
				     "data = {\"messages\":[\n",
			
 
				     "            {\"role\":\"system\", \"content\":\"You are a helpful assistant.\"},\n",
			
 
				     "            {\"role\":\"user\", \"content\":\"Who wrote the book Innovators dilemma?\"}], \n",
			
@@ -323,14 +254,12 @@
 
				    "cell_type": "markdown",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "## Use Llama 2 API with LangChain\n",
			
 
				+    "## Use Llama 3 API with LangChain\n",
			
 
				     "\n",
			
 
				-    "In this section, we will demonstrate how to use Llama 2 APIs with LangChain, one of the most popular framework to accelerate building your AI product.  \n",
			
 
				+    "In this section, we will demonstrate how to use Llama 3 APIs with LangChain, one of the most popular framework to accelerate building your AI product.  \n",
			
 
				     "One common solution here is to create your customized LLM instance, so you can add it to various chains to complete different tasks.  \n",
			
 
				     "In this example, we will use the `AzureMLOnlineEndpoint` class LangChain provides to build a customized LLM instance. This particular class is designed to take in Azure endpoint and API keys as inputs and wire it with HTTP calls. So the underlying of it is very similar to how we used `urllib.request` library to send RESTful calls in previous examples to the Azure Endpoint.   \n",
			
 
				     "\n",
			
 
				-    "Note Azure is working on a standard solution for LangChain integration in this [PR](https://github.com/langchain-ai/langchain/pull/14560), you should consider migrating to that in the future. \n",
			
 
				-    "\n",
			
 
				     "First, let's install dependencies: \n",
			
 
				     "\n"
			
 
				    ]
			
@@ -363,7 +292,7 @@
 
				     "\n",
			
 
				     "\n",
			
 
				     "class AzureLlamaAPIContentFormatter(ContentFormatterBase):\n",
			
 
				-    "#Content formatter for Llama 2 API for Azure MaaS\n",
			
 
				+    "#Content formatter for Llama 3 API for Azure MaaS\n",
			
 
				     "\n",
			
 
				     "    def format_request_payload(self, prompt: str, model_kwargs: Dict) -> bytes:\n",
			
 
				     "        #Formats the request according to the chosen api\n",
			
@@ -450,18 +379,11 @@
 
				    "cell_type": "markdown",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "At the time of writing this sample notebook, LangChain doesn't support streaming with `AzureMLOnlineEndpoint` for Llama 2. We are working with LangChain and Azure team to implement that."
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "markdown",
			
 
				-   "metadata": {},
			
 
				-   "source": [
			
 
				-    "## Build a chatbot with Llama 2 API\n",
			
 
				+    "## Build a chatbot with Llama 3 API\n",
			
 
				     "\n",
			
 
				-    "In this section, we will build a simple chatbot using Azure Llama 2 API, LangChain and [Gradio](https://www.gradio.app/)'s `ChatInterface` with memory capability.\n",
			
 
				+    "In this section, we will build a simple chatbot using Azure Llama 3 API, LangChain and [Gradio](https://www.gradio.app/)'s `ChatInterface` with memory capability.\n",
			
 
				     "\n",
			
 
				-    "Gradio is a framework to help demo your machine learning model with a web interface. We also have a dedicated Gradio chatbot [example](https://github.com/meta-llama/llama-recipes/blob/main/recipes/use_cases/chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) built with Llama 2 on-premises with RAG.   \n",
			
 
				+    "Gradio is a framework to help demo your machine learning model with a web interface. We also have a dedicated Gradio chatbot [example](https://github.com/meta-llama/llama-recipes/blob/main/recipes/use_cases/chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) built with Llama 3 on-premises with RAG.   \n",
			
 
				     "\n",
			
 
				     "First, let's install Gradio dependencies.\n"
			
 
				    ]
			
@@ -508,7 +430,7 @@
 
				     "langchain.debug=True\n",
			
 
				     "\n",
			
 
				     "class AzureLlamaAPIContentFormatter(ContentFormatterBase):\n",
			
 
				-    "#Content formatter for Llama 2 API for Azure MaaS\n",
			
 
				+    "#Content formatter for Llama 3 API for Azure MaaS\n",
			
 
				     "\n",
			
 
				     "    def format_request_payload(self, prompt: str, model_kwargs: Dict) -> bytes:\n",
			
 
				     "        #Formats the request according to the chosen api\n",
			
@@ -602,7 +524,7 @@
 
				    "name": "python",
			
 
				    "nbconvert_exporter": "python",
			
 
				    "pygments_lexer": "ipython3",
			
 
				-   "version": "3.10.10"
			
 
				+   "version": "3.9.6"
			
 
				   }
			
 
				  },
			
 
				  "nbformat": 4,
			
--- a/recipes/llama_api_providers/OctoAI_API_examples/Getting_to_know_Llama.ipynb
+++ b/recipes/llama_api_providers/OctoAI_API_examples/Getting_to_know_Llama.ipynb
@@ -6,8 +6,43 @@
 
				     "id": "LERqQn5v8-ak"
			
 
				    },
			
 
				    "source": [
			
 
				-    "# **Getting to know Llama 2: Everything you need to start building**\n",
			
 
				-    "Our goal in this session is to provide a guided tour of Llama 2, including understanding different Llama 2 models, how and where to access them, Generative AI and Chatbot architectures, prompt engineering, RAG (Retrieval Augmented Generation), Fine-tuning and more. All this is implemented with a starter code for you to take it and use it in your Llama 2 projects."
			
 
				+    "# **Getting to know Llama 3: Everything you need to start building**\n",
			
 
				+    "Our goal in this session is to provide a guided tour of Llama 3, including understanding different Llama 3 models, how and where to access them, Generative AI and Chatbot architectures, prompt engineering, RAG (Retrieval Augmented Generation), Fine-tuning and more. All this is implemented with a starter code for you to take it and use it in your Llama 3 projects."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "metadata": {
			
 
				+    "id": "h3YGMDJidHtH"
			
 
				+   },
			
 
				+   "source": [
			
 
				+    "### **Install dependencies**"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "metadata": {
			
 
				+    "id": "VhN6hXwx7FCp"
			
 
				+   },
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Install dependencies and initialize\n",
			
 
				+    "%pip install \\\n",
			
 
				+    "    langchain==0.1.19 \\\n",
			
 
				+    "    matplotlib \\\n",
			
 
				+    "    octoai-sdk==0.10.1 \\\n",
			
 
				+    "    openai \\\n",
			
 
				+    "    sentence_transformers \\\n",
			
 
				+    "    pdf2image \\\n",
			
 
				+    "    pdfminer \\\n",
			
 
				+    "    pdfminer.six \\\n",
			
 
				+    "    unstructured \\\n",
			
 
				+    "    faiss-cpu \\\n",
			
 
				+    "    pillow-heif \\\n",
			
 
				+    "    opencv-python \\\n",
			
 
				+    "    unstructured-inference \\\n",
			
 
				+    "    pikepdf"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -58,7 +93,7 @@
 
				     "    A[Users] --> B(Applications e.g. mobile, web)\n",
			
 
				     "    B --> |Hosted API|C(Platforms e.g. Custom, OctoAI, HuggingFace, Replicate)\n",
			
 
				     "    B -- optional --> E(Frameworks e.g. LangChain)\n",
			
 
				-    "    C-->|User Input|D[Llama 2]\n",
			
 
				+    "    C-->|User Input|D[Llama 3]\n",
			
 
				     "    D-->|Model Output|C\n",
			
 
				     "    E --> C\n",
			
 
				     "    classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
			
@@ -69,19 +104,15 @@
 
				     "  flowchart TD\n",
			
 
				     "    A[User Prompts] --> B(Frameworks e.g. LangChain)\n",
			
 
				     "    B <--> |Database, Docs, XLS|C[fa:fa-database External Data]\n",
			
 
				-    "    B -->|API|D[Llama 2]\n",
			
 
				+    "    B -->|API|D[Llama 3]\n",
			
 
				     "    classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
			
 
				     "  \"\"\")\n",
			
 
				     "\n",
			
 
				-    "def llama2_family():\n",
			
 
				+    "def llama3_family():\n",
			
 
				     "  mm(\"\"\"\n",
			
 
				     "  graph LR;\n",
			
 
				-    "      llama-2 --> llama-2-7b\n",
			
 
				-    "      llama-2 --> llama-2-13b\n",
			
 
				-    "      llama-2 --> llama-2-70b\n",
			
 
				-    "      llama-2-7b --> llama-2-7b-chat\n",
			
 
				-    "      llama-2-13b --> llama-2-13b-chat\n",
			
 
				-    "      llama-2-70b --> llama-2-70b-chat\n",
			
 
				+    "      llama-3 --> llama-3-8b-instruct\n",
			
 
				+    "      llama-3 --> llama-3-70b-instruct\n",
			
 
				     "      classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
			
 
				     "  \"\"\")\n",
			
 
				     "\n",
			
@@ -91,7 +122,7 @@
 
				     "    users --> apps\n",
			
 
				     "    apps --> frameworks\n",
			
 
				     "    frameworks --> platforms\n",
			
 
				-    "    platforms --> Llama 2\n",
			
 
				+    "    platforms --> Llama 3\n",
			
 
				     "    classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
			
 
				     "  \"\"\")\n",
			
 
				     "\n",
			
@@ -115,8 +146,8 @@
 
				     "  user --> prompt\n",
			
 
				     "  prompt --> i_safety\n",
			
 
				     "  i_safety --> context\n",
			
 
				-    "  context --> Llama_2\n",
			
 
				-    "  Llama_2 --> output\n",
			
 
				+    "  context --> Llama_3\n",
			
 
				+    "  Llama_3 --> output\n",
			
 
				     "  output --> o_safety\n",
			
 
				     "  i_safety --> memory\n",
			
 
				     "  o_safety --> memory\n",
			
@@ -165,7 +196,7 @@
 
				     "id": "i4Np_l_KtIno"
			
 
				    },
			
 
				    "source": [
			
 
				-    "##**1 - Understanding Llama 2**"
			
 
				+    "##**1 - Understanding Llama 3**"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -174,14 +205,13 @@
 
				     "id": "PGPSI3M5PGTi"
			
 
				    },
			
 
				    "source": [
			
 
				-    "### **1.1 - What is Llama 2?**\n",
			
 
				+    "### **1.1 - What is Llama 3?**\n",
			
 
				     "\n",
			
 
				     "* State of the art (SOTA), Open Source LLM\n",
			
 
				-    "* 7B, 13B, 70B\n",
			
 
				+    "* Llama 3 8B, 70B\n",
			
 
				     "* Pretrained + Chat\n",
			
 
				     "* Choosing model: Size, Quality, Cost, Speed\n",
			
 
				-    "* [Research paper](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)\n",
			
 
				-    "\n",
			
 
				+    "* [Llama 3 blog](https://ai.meta.com/blog/meta-llama-3/)\n",
			
 
				     "* [Responsible use guide](https://ai.meta.com/llama/responsible-use-guide/)"
			
 
				    ]
			
 
				   },
			
@@ -208,7 +238,7 @@
 
				    },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "llama2_family()"
			
 
				+    "llama3_family()"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -217,11 +247,10 @@
 
				     "id": "aYeHVVh45bdT"
			
 
				    },
			
 
				    "source": [
			
 
				-    "###**1.2 - Accessing Llama 2**\n",
			
 
				+    "###**1.2 - Accessing Llama 3**\n",
			
 
				     "* Download + Self Host (on-premise)\n",
			
 
				     "* Hosted API Platform (e.g. [OctoAI](https://octoai.cloud/), [Replicate](https://replicate.com/meta))\n",
			
 
				-    "* Hosted Container Platform (e.g. [Azure](https://techcommunity.microsoft.com/t5/ai-machine-learning-blog/introducing-llama-2-on-azure/ba-p/3881233), [AWS](https://aws.amazon.com/blogs/machine-learning/llama-2-foundation-models-from-meta-are-now-available-in-amazon-sagemaker-jumpstart/), [GCP](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/139))\n",
			
 
				-    "\n"
			
 
				+    "* Hosted Container Platform (e.g. [Azure](https://techcommunity.microsoft.com/t5/ai-machine-learning-blog/introducing-llama-2-on-azure/ba-p/3881233), [AWS](https://aws.amazon.com/blogs/machine-learning/llama-2-foundation-models-from-meta-are-now-available-in-amazon-sagemaker-jumpstart/), [GCP](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/139))"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -230,7 +259,7 @@
 
				     "id": "kBuSay8vtzL4"
			
 
				    },
			
 
				    "source": [
			
 
				-    "### **1.3 - Use Cases of Llama 2**\n",
			
 
				+    "### **1.3 - Use Cases of Llama 3**\n",
			
 
				     "* Content Generation\n",
			
 
				     "* Chatbots\n",
			
 
				     "* Summarization\n",
			
@@ -245,42 +274,9 @@
 
				     "id": "sd54g0OHuqBY"
			
 
				    },
			
 
				    "source": [
			
 
				-    "##**2 - Using Llama 2**\n",
			
 
				+    "##**2 - Using Llama 3**\n",
			
 
				     "\n",
			
 
				-    "In this notebook, we are going to access [Llama 13b chat model](https://octoai.cloud/tools/text/chat?mode=demo&model=llama-2-13b-chat-fp16) using hosted API from OctoAI."
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "markdown",
			
 
				-   "metadata": {
			
 
				-    "id": "h3YGMDJidHtH"
			
 
				-   },
			
 
				-   "source": [
			
 
				-    "### **2.1 - Install dependencies**"
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "code",
			
 
				-   "execution_count": null,
			
 
				-   "metadata": {
			
 
				-    "id": "VhN6hXwx7FCp"
			
 
				-   },
			
 
				-   "outputs": [],
			
 
				-   "source": [
			
 
				-    "# Install dependencies and initialize\n",
			
 
				-    "%pip install -qU \\\n",
			
 
				-    "    octoai-sdk \\\n",
			
 
				-    "    langchain \\\n",
			
 
				-    "    sentence_transformers \\\n",
			
 
				-    "    pdf2image \\\n",
			
 
				-    "    pdfminer \\\n",
			
 
				-    "    pdfminer.six \\\n",
			
 
				-    "    unstructured \\\n",
			
 
				-    "    faiss-cpu \\\n",
			
 
				-    "    pillow-heif \\\n",
			
 
				-    "    opencv-python \\\n",
			
 
				-    "    unstructured-inference \\\n",
			
 
				-    "    pikepdf"
			
 
				+    "In this notebook, we are going to access [Llama 3 8b instruct model](https://octoai.cloud/text/chat?model=meta-llama-3-8b-instruct&mode=api) using hosted API from OctoAI."
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -292,9 +288,9 @@
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "# model on OctoAI platform that we will use for inferencing\n",
			
 
				-    "# We will use llama 13b chat model hosted on OctoAI server ()\n",
			
 
				+    "# We will use llama 3 8b instruct model hosted on OctoAI server\n",
			
 
				     "\n",
			
 
				-    "llama2_13b = \"llama-2-13b-chat-fp16\""
			
 
				+    "llama3_8b = \"meta-llama-3-8b-instruct\""
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -326,21 +322,21 @@
 
				    },
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "# we will use OctoAI's hosted API\n",
			
 
				-    "from octoai.client import Client\n",
			
 
				+    "# We will use OpenAI's APIs to talk to OctoAI's hosted model endpoint\n",
			
 
				+    "from openai import OpenAI\n",
			
 
				     "\n",
			
 
				-    "client = Client(OCTOAI_API_TOKEN)\n",
			
 
				+    "client = OpenAI(\n",
			
 
				+    "   base_url = \"https://text.octoai.run/v1\",\n",
			
 
				+    "   api_key = os.environ[\"OCTOAI_API_TOKEN\"]\n",
			
 
				+    ")\n",
			
 
				     "\n",
			
 
				     "# text completion with input prompt\n",
			
 
				     "def Completion(prompt):\n",
			
 
				     "    output = client.chat.completions.create(\n",
			
 
				     "        messages=[\n",
			
 
				-    "            {\n",
			
 
				-    "                \"role\": \"user\",\n",
			
 
				-    "                \"content\": prompt\n",
			
 
				-    "            }\n",
			
 
				+    "            {\"role\": \"user\", \"content\": prompt}\n",
			
 
				     "        ],\n",
			
 
				-    "        model=\"llama-2-13b-chat-fp16\",\n",
			
 
				+    "        model=llama3_8b,\n",
			
 
				     "        max_tokens=1000\n",
			
 
				     "    )\n",
			
 
				     "    return output.choices[0].message.content\n",
			
@@ -349,16 +345,10 @@
 
				     "def ChatCompletion(prompt, system_prompt=None):\n",
			
 
				     "    output = client.chat.completions.create(\n",
			
 
				     "        messages=[\n",
			
 
				-    "            {\n",
			
 
				-    "                \"role\": \"system\",\n",
			
 
				-    "                \"content\": system_prompt\n",
			
 
				-    "            },\n",
			
 
				-    "            {\n",
			
 
				-    "                \"role\": \"user\",\n",
			
 
				-    "                \"content\": prompt\n",
			
 
				-    "            }\n",
			
 
				+    "            {\"role\": \"system\", \"content\": system_prompt},\n",
			
 
				+    "            {\"role\": \"user\", \"content\": prompt}\n",
			
 
				     "        ],\n",
			
 
				-    "        model=\"llama-2-13b-chat-fp16\",\n",
			
 
				+    "        model=llama3_8b,\n",
			
 
				     "        max_tokens=1000\n",
			
 
				     "    )\n",
			
 
				     "    return output.choices[0].message.content"
			
@@ -370,7 +360,7 @@
 
				     "id": "5Jxq0pmf6L73"
			
 
				    },
			
 
				    "source": [
			
 
				-    "### **2.2 - Basic completion**"
			
 
				+    "# **2.1 - Basic completion**"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -391,7 +381,7 @@
 
				     "id": "StccjUDh6W0Q"
			
 
				    },
			
 
				    "source": [
			
 
				-    "### **2.3 - System prompts**\n"
			
 
				+    "## **2.2 - System prompts**\n"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -415,7 +405,7 @@
 
				     "id": "Hp4GNa066pYy"
			
 
				    },
			
 
				    "source": [
			
 
				-    "### **2.4 - Response formats**\n",
			
 
				+    "### **2.3 - Response formats**\n",
			
 
				     "* Can support different formatted outputs e.g. text, JSON, etc."
			
 
				    ]
			
 
				   },
			
@@ -483,7 +473,7 @@
 
				     "\n",
			
 
				     "* User Prompts\n",
			
 
				     "* Input Safety\n",
			
 
				-    "* Llama 2\n",
			
 
				+    "* Llama 3\n",
			
 
				     "* Output Safety\n",
			
 
				     "\n",
			
 
				     "* Memory & Context"
			
@@ -743,12 +733,9 @@
 
				     "### **4.3 - Retrieval Augmented Generation (RAG)**\n",
			
 
				     "* Prompt Eng Limitations - Knowledge cutoff & lack of specialized data\n",
			
 
				     "\n",
			
 
				-    "* Retrieval Augmented Generation(RAG) allows us to retrieve snippets of information from external data sources and augment it to the user's prompt to get tailored responses from Llama 2.\n",
			
 
				-    "\n",
			
 
				-    "For our demo, we are going to download an external PDF file from a URL and query against the content in the pdf file to get contextually relevant information back with the help of Llama!\n",
			
 
				+    "* Retrieval Augmented Generation(RAG) allows us to retrieve snippets of information from external data sources and augment it to the user's prompt to get tailored responses from Llama 3.\n",
			
 
				     "\n",
			
 
				-    "\n",
			
 
				-    "\n"
			
 
				+    "For our demo, we are going to download an external PDF file from a URL and query against the content in the pdf file to get contextually relevant information back with the help of Llama!"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -797,24 +784,16 @@
 
				    "source": [
			
 
				     "# langchain setup\n",
			
 
				     "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
			
 
				-    "# Use the Llama 2 model hosted on OctoAI\n",
			
 
				-    "# Temperature: Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic, 0.75 is a good starting value\n",
			
 
				+    "\n",
			
 
				+    "# Use the Llama 3 model hosted on OctoAI\n",
			
 
				+    "# max_tokens: Maximum number of tokens to generate. A word is generally 2-3 tokens\n",
			
 
				+    "# temperature: Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic, 0.75 is a good starting value\n",
			
 
				     "# top_p: When decoding text, samples from the top p percentage of most likely tokens; lower to ignore less likely tokens\n",
			
 
				-    "# max_new_tokens: Maximum number of tokens to generate. A word is generally 2-3 tokens\n",
			
 
				     "llama_model = OctoAIEndpoint(\n",
			
 
				-    "    endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n",
			
 
				-    "    model_kwargs={\n",
			
 
				-    "        \"model\": llama2_13b,\n",
			
 
				-    "        \"messages\": [\n",
			
 
				-    "            {\n",
			
 
				-    "                \"role\": \"system\",\n",
			
 
				-    "                \"content\": \"You are a helpful, respectful and honest assistant.\"\n",
			
 
				-    "            }\n",
			
 
				-    "        ],\n",
			
 
				-    "        \"max_tokens\": 1000,\n",
			
 
				-    "        \"top_p\": 1,\n",
			
 
				-    "        \"temperature\": 0.75\n",
			
 
				-    "    },\n",
			
 
				+    "    model=llama3_8b,\n",
			
 
				+    "    max_tokens=1000,\n",
			
 
				+    "    temperature=0.75,\n",
			
 
				+    "    top_p=1\n",
			
 
				     ")"
			
 
				    ]
			
 
				   },
			
@@ -973,10 +952,11 @@
 
				    },
			
 
				    "source": [
			
 
				     "#### **Resources**\n",
			
 
				-    "- [GitHub - Llama 2](https://github.com/facebookresearch/llama)\n",
			
 
				-    "- [Github - LLama 2 Recipes](https://github.com/facebookresearch/llama-recipes)\n",
			
 
				-    "- [Llama 2](https://ai.meta.com/llama/)\n",
			
 
				-    "- [Research Paper](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)\n",
			
 
				+    "- [GitHub - Llama](https://github.com/facebookresearch/llama)\n",
			
 
				+    "- [Github - LLama Recipes](https://github.com/facebookresearch/llama-recipes)\n",
			
 
				+    "- [Llama](https://ai.meta.com/llama/)\n",
			
 
				+    "- [Research Paper on Llama 2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)\n",
			
 
				+    "- [Llama 3 Page](https://ai.meta.com/blog/meta-llama-3/)\n",
			
 
				     "- [Model Card](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md)\n",
			
 
				     "- [Responsible Use Guide](https://ai.meta.com/llama/responsible-use-guide/)\n",
			
 
				     "- [Acceptable Use Policy](https://ai.meta.com/llama/use-policy/)\n",
			
@@ -992,9 +972,9 @@
 
				    "source": [
			
 
				     "#### **Authors & Contact**\n",
			
 
				     "  * asangani@meta.com, [Amit Sangani | LinkedIn](https://www.linkedin.com/in/amitsangani/)\n",
			
 
				-    "  * mohsena@meta.com, [Mohsen Agsen | LinkedIn](https://www.linkedin.com/in/mohsen-agsen-62a9791/)\n",
			
 
				+    "  * mohsena@meta.com, [Mohsen Agsen | LinkedIn](https://www.linkedin.com/in/dr-thierry-moreau/)\n",
			
 
				     "\n",
			
 
				-    "Adapted to run on OctoAI by Thierry Moreau - tmoreau@octo.ai"
			
 
				+    "Adapted to run on OctoAI and use Llama 3 by tmoreau@octo.ai [Thierry Moreay | LinkedIn]()"
			
 
				    ]
			
 
				   }
			
 
				  ],
			
--- a/recipes/llama_api_providers/OctoAI_API_examples/HelloLlamaCloud.ipynb
+++ b/recipes/llama_api_providers/OctoAI_API_examples/HelloLlamaCloud.ipynb
@@ -6,13 +6,12 @@
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "## This demo app shows:\n",
			
 
				-    "* How to run Llama2 in the cloud hosted on OctoAI\n",
			
 
				+    "* How to run Llama 3 in the cloud hosted on OctoAI\n",
			
 
				     "* How to use LangChain to ask Llama general questions and follow up questions\n",
			
 
				-    "* How to use LangChain to load a recent PDF doc - the Llama2 paper pdf - and chat about it. This is the well known RAG (Retrieval Augmented Generation) method to let LLM such as Llama2 be able to answer questions about the data not publicly available when Llama2 was trained, or about your own data. RAG is one way to prevent LLM's hallucination\n",
			
 
				-    "* You should also review the [HelloLlamaLocal](HelloLlamaLocal.ipynb) notebook for more information on RAG\n",
			
 
				+    "* How to use LangChain to load a recent PDF doc - the Llama paper pdf - and chat about it. This is the well known RAG (Retrieval Augmented Generation) method to let LLM such as Llama be able to answer questions about your own data. RAG is one way to prevent LLM's hallucination\n",
			
 
				     "\n",
			
 
				     "**Note** We will be using OctoAI to run the examples here. You will need to first sign into [OctoAI](https://octoai.cloud/) with your Github or Google account, then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first).\n",
			
 
				-    "After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI."
			
 
				+    "After the free trial ends, you will need to enter billing info to continue to use Llama 3 hosted on OctoAI."
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -35,7 +34,7 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "!pip install langchain octoai-sdk sentence-transformers chromadb pypdf"
			
 
				+    "%pip install langchain==0.1.19 octoai-sdk==0.10.1 openai sentence-transformers chromadb pypdf"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -57,15 +56,17 @@
 
				    "id": "3e8870c1",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "Next we call the Llama 2 model from OctoAI. In this example we will use the Llama 2 13b chat FP16 model. You can find more on Llama 2 models on the [OctoAI text generation solution page](https://octoai.cloud/tools/text).\n",
			
 
				+    "Next we call the Llama 3 model from OctoAI. In this example we will use the Llama 3 8b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
			
 
				     "\n",
			
 
				     "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
			
 
				-    "* llama-2-13b-chat\n",
			
 
				-    "* llama-2-70b-chat\n",
			
 
				+    "* meta-llama-3-8b-instruct\n",
			
 
				+    "* meta-llama-3-70b-instruct\n",
			
 
				     "* codellama-7b-instruct\n",
			
 
				     "* codellama-13b-instruct\n",
			
 
				     "* codellama-34b-instruct\n",
			
 
				-    "* codellama-70b-instruct"
			
 
				+    "* llama-2-13b-chat\n",
			
 
				+    "* llama-2-70b-chat\n",
			
 
				+    "* llamaguard-7b"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -77,21 +78,11 @@
 
				    "source": [
			
 
				     "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
			
 
				     "\n",
			
 
				-    "llama2_13b = \"llama-2-13b-chat-fp16\"\n",
			
 
				+    "llama3_8b = \"meta-llama-3-8b-instruct\"\n",
			
 
				     "llm = OctoAIEndpoint(\n",
			
 
				-    "    endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n",
			
 
				-    "    model_kwargs={\n",
			
 
				-    "        \"model\": llama2_13b,\n",
			
 
				-    "        \"messages\": [\n",
			
 
				-    "            {\n",
			
 
				-    "                \"role\": \"system\",\n",
			
 
				-    "                \"content\": \"You are a helpful, respectful and honest assistant.\"\n",
			
 
				-    "            }\n",
			
 
				-    "        ],\n",
			
 
				-    "        \"max_tokens\": 500,\n",
			
 
				-    "        \"top_p\": 1,\n",
			
 
				-    "        \"temperature\": 0.01\n",
			
 
				-    "    },\n",
			
 
				+    "    model=llama3_8b,\n",
			
 
				+    "    max_tokens=500,\n",
			
 
				+    "    temperature=0.01\n",
			
 
				     ")"
			
 
				    ]
			
 
				   },
			
@@ -111,7 +102,7 @@
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "question = \"who wrote the book Innovator's dilemma?\"\n",
			
 
				-    "answer = llm(question)\n",
			
 
				+    "answer = llm.invoke(question)\n",
			
 
				     "print(answer)"
			
 
				    ]
			
 
				   },
			
@@ -134,7 +125,7 @@
 
				    "source": [
			
 
				     "# chat history not passed so Llama doesn't have the context and doesn't know this is more about the book\n",
			
 
				     "followup = \"tell me more\"\n",
			
 
				-    "followup_answer = llm(followup)\n",
			
 
				+    "followup_answer = llm.invoke(followup)\n",
			
 
				     "print(followup_answer)"
			
 
				    ]
			
 
				   },
			
@@ -162,7 +153,7 @@
 
				     "memory = ConversationBufferMemory()\n",
			
 
				     "conversation = ConversationChain(\n",
			
 
				     "    llm=llm, \n",
			
 
				-    "    memory = memory,\n",
			
 
				+    "    memory=memory,\n",
			
 
				     "    verbose=False\n",
			
 
				     ")"
			
 
				    ]
			
@@ -208,11 +199,10 @@
 
				    "id": "fc436163",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "Next, let's explore using Llama 2 to answer questions using documents for context. \n",
			
 
				-    "This gives us the ability to update Llama 2's knowledge thus giving it better context without needing to finetune. \n",
			
 
				-    "For a more in-depth study of this, see the notebook on using Llama 2 locally [here](HelloLlamaLocal.ipynb)\n",
			
 
				+    "Next, let's explore using Llama 3 to answer questions using documents for context. \n",
			
 
				+    "This gives us the ability to update Llama 3's knowledge thus giving it better context without needing to finetune. \n",
			
 
				     "\n",
			
 
				-    "We will use the PyPDFLoader to load in a pdf, in this case, the Llama 2 paper."
			
 
				+    "We will use the PyPDFLoader to load in a pdf, in this case, the Llama paper."
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -301,7 +291,7 @@
 
				    "id": "54ad02d7",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "We then use ` RetrievalQA` to retrieve the documents from the vector database and give the model more context on Llama 2, thereby increasing its knowledge.\n",
			
 
				+    "We then use ` RetrievalQA` to retrieve the documents from the vector database and give the model more context on Llama, thereby increasing its knowledge.\n",
			
 
				     "\n",
			
 
				     "For each question, LangChain performs a semantic similarity search of it in the vector db, then passes the search results as the context to Llama to answer the question."
			
 
				    ]
			
@@ -321,7 +311,7 @@
 
				     "    retriever=vectordb.as_retriever()\n",
			
 
				     ")\n",
			
 
				     "\n",
			
 
				-    "question = \"What is llama2?\"\n",
			
 
				+    "question = \"What is llama?\"\n",
			
 
				     "result = qa_chain({\"query\": question})\n",
			
 
				     "print(result['result'])"
			
 
				    ]
			
@@ -344,7 +334,7 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "# no context passed so Llama2 doesn't have enough context to answer so it lets its imagination go wild\n",
			
 
				+    "# no context passed so Llama doesn't have enough context to answer so it lets its imagination go wild\n",
			
 
				     "result = qa_chain({\"query\": \"what are its use cases?\"})\n",
			
 
				     "print(result['result'])"
			
 
				    ]
			
@@ -376,7 +366,7 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "# let's ask the original question \"What is llama2?\" again\n",
			
 
				+    "# let's ask the original question \"What is llama?\" again\n",
			
 
				     "result = chat_chain({\"question\": question, \"chat_history\": []})\n",
			
 
				     "print(result['answer'])"
			
 
				    ]
			
--- a/recipes/llama_api_providers/OctoAI_API_examples/LiveData.ipynb
+++ b/recipes/llama_api_providers/OctoAI_API_examples/LiveData.ipynb
@@ -7,12 +7,12 @@
 
				    "source": [
			
 
				     "## This demo app shows:\n",
			
 
				     "* How to use LlamaIndex, an open source library to help you build custom data augmented LLM applications\n",
			
 
				-    "* How to ask Llama questions about recent live data via the You.com live search API and LlamaIndex\n",
			
 
				+    "* How to ask Llama 3 questions about recent live data via the Tavily live search API\n",
			
 
				     "\n",
			
 
				-    "The LangChain package is used to facilitate the call to Llama2 hosted on OctoAI\n",
			
 
				+    "The LangChain package is used to facilitate the call to Llama 3 hosted on OctoAI\n",
			
 
				     "\n",
			
 
				     "**Note** We will be using OctoAI to run the examples here. You will need to first sign into [OctoAI](https://octoai.cloud/) with your Github or Google account, then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first).\n",
			
 
				-    "After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI."
			
 
				+    "After the free trial ends, you will need to enter billing info to continue to use Llama3 hosted on OctoAI."
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -32,23 +32,13 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "!pip install llama-index langchain"
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "code",
			
 
				-   "execution_count": null,
			
 
				-   "id": "21fe3849",
			
 
				-   "metadata": {},
			
 
				-   "outputs": [],
			
 
				-   "source": [
			
 
				-    "# use ServiceContext to configure the LLM used and the custom embeddings\n",
			
 
				-    "from llama_index import ServiceContext\n",
			
 
				-    "\n",
			
 
				-    "# VectorStoreIndex is used to index custom data \n",
			
 
				-    "from llama_index import VectorStoreIndex\n",
			
 
				-    "\n",
			
 
				-    "from langchain.llms.octoai_endpoint import OctoAIEndpoint"
			
 
				+    "!pip install llama-index \n",
			
 
				+    "!pip install llama-index-core\n",
			
 
				+    "!pip install llama-index-llms-octoai\n",
			
 
				+    "!pip install llama-index-embeddings-octoai\n",
			
 
				+    "!pip install octoai-sdk\n",
			
 
				+    "!pip install tavily-python\n",
			
 
				+    "!pip install replicate"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -75,227 +65,161 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "f8ff812b",
			
 
				-   "metadata": {},
			
 
				-   "source": [
			
 
				-    "In this example we will use the [YOU.com](https://you.com/) search engine to augment the LLM's responses.\n",
			
 
				-    "To use the You.com Search API, you can email api@you.com to request an API key. "
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "code",
			
 
				-   "execution_count": null,
			
 
				-   "id": "75275628-5235-4b55-8033-601c76107528",
			
 
				-   "metadata": {},
			
 
				-   "outputs": [],
			
 
				-   "source": [
			
 
				-    "YOUCOM_API_KEY = getpass()\n",
			
 
				-    "os.environ[\"YOUCOM_API_KEY\"] = YOUCOM_API_KEY"
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "markdown",
			
 
				    "id": "cb210c7c",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "We then call the Llama 2 model from OctoAI.\n",
			
 
				+    "We then call the Llama 3 model from OctoAI.\n",
			
 
				     "\n",
			
 
				-    "We will use the Llama 2 13b chat FP16 model. You can find more on Llama 2 models on the [OctoAI text generation solution page](https://octoai.cloud/tools/text).\n",
			
 
				+    "We will use the Llama 3 8b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
			
 
				     "\n",
			
 
				     "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
			
 
				-    "* llama-2-13b-chat\n",
			
 
				-    "* llama-2-70b-chat\n",
			
 
				+    "* meta-llama-3-8b-instruct\n",
			
 
				+    "* meta-llama-3-70b-instruct\n",
			
 
				     "* codellama-7b-instruct\n",
			
 
				     "* codellama-13b-instruct\n",
			
 
				     "* codellama-34b-instruct\n",
			
 
				-    "* codellama-70b-instruct"
			
 
				+    "* llama-2-13b-chat\n",
			
 
				+    "* llama-2-70b-chat\n",
			
 
				+    "* llamaguard-7b"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "c12fc2cb",
			
 
				+   "id": "21fe3849",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "# set llm to be using Llama2 hosted on OctoAI\n",
			
 
				-    "llama2_13b = \"llama-2-13b-chat-fp16\"\n",
			
 
				+    "# use ServiceContext to configure the LLM used and the custom embeddings\n",
			
 
				+    "from llama_index.core import ServiceContext\n",
			
 
				     "\n",
			
 
				-    "llm = OctoAIEndpoint(\n",
			
 
				-    "    endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n",
			
 
				-    "    model_kwargs={\n",
			
 
				-    "        \"model\": llama2_13b,\n",
			
 
				-    "        \"messages\": [\n",
			
 
				-    "            {\n",
			
 
				-    "                \"role\": \"system\",\n",
			
 
				-    "                \"content\": \"You are a helpful, respectful and honest assistant.\"\n",
			
 
				-    "            }\n",
			
 
				-    "        ],\n",
			
 
				-    "        \"max_tokens\": 500,\n",
			
 
				-    "        \"top_p\": 1,\n",
			
 
				-    "        \"temperature\": 0.01\n",
			
 
				-    "    },\n",
			
 
				-    ")"
			
 
				+    "# VectorStoreIndex is used to index custom data \n",
			
 
				+    "from llama_index.core import VectorStoreIndex\n",
			
 
				+    "\n",
			
 
				+    "from llama_index.core import Settings, VectorStoreIndex\n",
			
 
				+    "from llama_index.embeddings.octoai import OctoAIEmbedding\n",
			
 
				+    "from llama_index.llms.octoai import OctoAI\n",
			
 
				+    "\n",
			
 
				+    "Settings.llm = OctoAI(\n",
			
 
				+    "    model=\"meta-llama-3-8b-instruct\",\n",
			
 
				+    "    token=OCTOAI_API_TOKEN,\n",
			
 
				+    "    temperature=0.0,\n",
			
 
				+    "    max_tokens=128,\n",
			
 
				+    ")\n",
			
 
				+    "\n",
			
 
				+    "Settings.embed_model = OctoAIEmbedding(api_key=OCTOAI_API_TOKEN)"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "476d72da",
			
 
				+   "id": "f8ff812b",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "Using our api key we set up earlier, we make a request from YOU.com for live data on a particular topic."
			
 
				+    "Next you will use the [Tavily](https://tavily.com/) search engine to augment the Llama 3's responses. To create a free trial Tavily Search API, sign in with your Google or Github account [here](https://app.tavily.com/sign-in)."
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "effc9656-b18d-4d24-a80b-6066564a838b",
			
 
				+   "id": "75275628-5235-4b55-8033-601c76107528",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "import requests\n",
			
 
				+    "from tavily import TavilyClient\n",
			
 
				     "\n",
			
 
				-    "query = \"Meta Connect\" # you can try other live data query about sports score, stock market and weather info \n",
			
 
				-    "headers = {\"X-API-Key\": os.environ[\"YOUCOM_API_KEY\"]}\n",
			
 
				-    "data = requests.get(\n",
			
 
				-    "    f\"https://api.ydc-index.io/search?query={query}\",\n",
			
 
				-    "    headers=headers,\n",
			
 
				-    ").json()"
			
 
				+    "TAVILY_API_KEY = getpass()\n",
			
 
				+    "tavily = TavilyClient(api_key=TAVILY_API_KEY)"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				-   "cell_type": "code",
			
 
				-   "execution_count": null,
			
 
				-   "id": "8bed3baf-742e-473c-ada1-4459012a8a2c",
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "476d72da",
			
 
				    "metadata": {},
			
 
				-   "outputs": [],
			
 
				    "source": [
			
 
				-    "# check the query result in JSON\n",
			
 
				-    "import json\n",
			
 
				-    "\n",
			
 
				-    "print(json.dumps(data, indent=2))"
			
 
				+    "Do a live web search on \"Llama 3 fine-tuning\"."
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				-   "cell_type": "markdown",
			
 
				-   "id": "b196e697",
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "effc9656-b18d-4d24-a80b-6066564a838b",
			
 
				    "metadata": {},
			
 
				+   "outputs": [],
			
 
				    "source": [
			
 
				-    "We then use the [`JSONLoader`](https://llamahub.ai/l/file-json) to extract the text from the returned data. The `JSONLoader` gives us the ability to load the data into LamaIndex.\n",
			
 
				-    "In the next cell we show how to load the JSON result with key info stored as \"snippets\".\n",
			
 
				-    "\n",
			
 
				-    "However, you can also add the snippets in the query result to documents like below:\n",
			
 
				-    "```python \n",
			
 
				-    "from llama_index import Document\n",
			
 
				-    "snippets = [snippet for hit in data[\"hits\"] for snippet in hit[\"snippets\"]]\n",
			
 
				-    "documents = [Document(text=s) for s in snippets]\n",
			
 
				-    "```\n",
			
 
				-    "This can be handy if you just need to add a list of text strings to doc"
			
 
				+    "response = tavily.search(query=\"Llama 3 fine-tuning\")\n",
			
 
				+    "context = [{\"url\": obj[\"url\"], \"content\": obj[\"content\"]} for obj in response['results']]"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "7c40e73f-ca13-4f4a-a753-e613df3d389e",
			
 
				+   "id": "6b5af98b-c26b-4fd7-8031-31ac4915cdac",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "# one way to load the JSON result with key info stored as \"snippets\"\n",
			
 
				-    "from llama_index import download_loader\n",
			
 
				-    "\n",
			
 
				-    "JsonDataReader = download_loader(\"JsonDataReader\")\n",
			
 
				-    "loader = JsonDataReader()\n",
			
 
				-    "documents = loader.load_data([hit[\"snippets\"] for hit in data[\"hits\"]])\n"
			
 
				+    "context"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "8e5e3b4e",
			
 
				+   "id": "0f4ea96b-bb00-4a1f-8bd2-7f15237415f6",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "With the data set up, we create a vector store for the data and a query engine for it.\n",
			
 
				-    "\n",
			
 
				-    "For our embeddings we will use `OctoAIEmbeddings` whose default embedding model is GTE-Large. This model provides a good balance between speed and performance.\n",
			
 
				-    "\n",
			
 
				-    "For more info see https://octoai.cloud/tools/text/embeddings?mode=demo&model=thenlper%2Fgte-large. "
			
 
				+    "Create documents based on the search results, index and save them to a vector store, then create a query engine."
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "a5de3080-2c4b-479c-baba-793b3bee36ed",
			
 
				+   "id": "7513ac70-155a-4d56-b326-0e8c2733ab99",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "# use OctoAI embeddings \n",
			
 
				-    "from langchain_community.embeddings import OctoAIEmbeddings\n",
			
 
				-    "from llama_index.embeddings import LangchainEmbedding\n",
			
 
				-    "\n",
			
 
				-    "\n",
			
 
				-    "embeddings = LangchainEmbedding(OctoAIEmbeddings(\n",
			
 
				-    "    endpoint_url=\"https://text.octoai.run/v1/embeddings\"\n",
			
 
				-    "))\n",
			
 
				-    "print(embeddings)\n",
			
 
				-    "\n",
			
 
				-    "# create a ServiceContext instance to use Llama2 and custom embeddings\n",
			
 
				-    "service_context = ServiceContext.from_defaults(llm=llm, chunk_size=800, chunk_overlap=20, embed_model=embeddings)\n",
			
 
				+    "from llama_index.core import Document\n",
			
 
				     "\n",
			
 
				-    "# create vector store index from the documents created above\n",
			
 
				-    "index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n",
			
 
				+    "documents = [Document(text=ct['content']) for ct in context]\n",
			
 
				+    "index = VectorStoreIndex.from_documents(documents)\n",
			
 
				     "\n",
			
 
				-    "# create query engine from the index\n",
			
 
				-    "query_engine = index.as_query_engine(streaming=False)"
			
 
				+    "query_engine = index.as_query_engine(streaming=True)"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "2c4ea012",
			
 
				+   "id": "df743c62-165c-4834-b1f1-7d7848a6815e",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "We are now ready to ask Llama 2 a question about the live data using our query engine."
			
 
				+    "You are now ready to ask Llama 3 questions about the live data using the query engine."
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "de91a191-d0f2-498e-88dc-b2b43423e0e5",
			
 
				+   "id": "b2fd905b-575a-45f1-88da-9b093caa232a",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "# ask Llama2 a summary question about the search result\n",
			
 
				     "response = query_engine.query(\"give me a summary\")\n",
			
 
				-    "print(str(response))"
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "code",
			
 
				-   "execution_count": null,
			
 
				-   "id": "72814b20-06aa-4da8-b4dd-f0b0d74a2ea0",
			
 
				-   "metadata": {},
			
 
				-   "outputs": [],
			
 
				-   "source": [
			
 
				-    "# more questions\n",
			
 
				-    "print(str(query_engine.query(\"what products were announced\")))"
			
 
				+    "response.print_response_stream()"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "a65bc037-a689-476d-b529-0059a27bc949",
			
 
				+   "id": "88c45380-1d00-46d5-80ac-0eff68fd1f8a",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "print(str(query_engine.query(\"tell me more about Meta AI assistant\")))"
			
 
				+    "query_engine.query(\"what's the latest about Llama 3 fine-tuning?\").print_response_stream()"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "16a56542",
			
 
				+   "id": "0fe54976-5345-4426-a6f0-dc3bfd45dac3",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "print(str(query_engine.query(\"what are Generative AI stickers\")))"
			
 
				+    "query_engine.query(\"tell me more about Llama 3 fine-tuning\").print_response_stream()"
			
 
				    ]
			
 
				   }
			
 
				  ],
			
--- a/recipes/llama_api_providers/OctoAI_API_examples/Llama2_Gradio.ipynb
+++ b/recipes/llama_api_providers/OctoAI_API_examples/Llama2_Gradio.ipynb
@@ -5,14 +5,14 @@
 
				    "id": "47a9adb3",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "## This demo app shows how to query Llama 2 using the Gradio UI.\n",
			
 
				+    "## This demo app shows how to query Llama 3 using the Gradio UI.\n",
			
 
				     "\n",
			
 
				     "Since we are using OctoAI in this example, you'll need to obtain an OctoAI token:\n",
			
 
				     "\n",
			
 
				     "- You will need to first sign into [OctoAI](https://octoai.cloud/) with your Github or Google account\n",
			
 
				     "- Then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first)\n",
			
 
				     "\n",
			
 
				-    "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI.\n",
			
 
				+    "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama 3 hosted on OctoAI.\n",
			
 
				     "\n",
			
 
				     "To run this example:\n",
			
 
				     "- Run the notebook\n",
			
@@ -22,8 +22,7 @@
 
				     "In the notebook or a browser with URL http://127.0.0.1:7860 you should see a UI with your answer.\n",
			
 
				     "\n",
			
 
				     "Let's start by installing the necessary packages:\n",
			
 
				-    "- langchain provides necessary RAG tools for this demo\n",
			
 
				-    "- octoai-sdk allows us to use OctoAI Llama 2 endpoint\n",
			
 
				+    "- openai for us to use its APIs to talk to the OctoAI endpoint\n",
			
 
				     "- gradio is used for the UI elements\n",
			
 
				     "\n",
			
 
				     "And setting up the OctoAI token."
			
@@ -36,7 +35,7 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "!pip install langchain octoai-sdk gradio"
			
 
				+    "!pip install openai gradio"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -60,37 +59,34 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "from langchain.schema import AIMessage, HumanMessage\n",
			
 
				     "import gradio as gr\n",
			
 
				-    "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
			
 
				+    "import openai\n",
			
 
				     "\n",
			
 
				-    "llama2_13b = \"llama-2-13b-chat-fp16\"\n",
			
 
				-    "\n",
			
 
				-    "llm = OctoAIEndpoint(\n",
			
 
				-    "    endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n",
			
 
				-    "    model_kwargs={\n",
			
 
				-    "        \"model\": llama2_13b,\n",
			
 
				-    "        \"messages\": [\n",
			
 
				-    "            {\n",
			
 
				-    "                \"role\": \"system\",\n",
			
 
				-    "                \"content\": \"You are a helpful, respectful and honest assistant.\"\n",
			
 
				-    "            }\n",
			
 
				-    "        ],\n",
			
 
				-    "        \"max_tokens\": 500,\n",
			
 
				-    "        \"top_p\": 1,\n",
			
 
				-    "        \"temperature\": 0.01\n",
			
 
				-    "    },\n",
			
 
				+    "# Init OctoAI client\n",
			
 
				+    "client = openai.OpenAI(\n",
			
 
				+    "    base_url=\"https://text.octoai.run/v1\",\n",
			
 
				+    "    api_key=os.environ[\"OCTOAI_API_TOKEN\"]\n",
			
 
				     ")\n",
			
 
				     "\n",
			
 
				-    "\n",
			
 
				     "def predict(message, history):\n",
			
 
				-    "    history_langchain_format = []\n",
			
 
				-    "    for human, ai in history:\n",
			
 
				-    "        history_langchain_format.append(HumanMessage(content=human))\n",
			
 
				-    "        history_langchain_format.append(AIMessage(content=ai))\n",
			
 
				-    "    history_langchain_format.append(HumanMessage(content=message))\n",
			
 
				-    "    llm_response = llm(message, history_langchain_format)\n",
			
 
				-    "    return llm_response.content\n",
			
 
				+    "    history_openai_format = []\n",
			
 
				+    "    for human, assistant in history:\n",
			
 
				+    "        history_openai_format.append({\"role\": \"user\", \"content\": human})\n",
			
 
				+    "        history_openai_format.append({\"role\": \"assistant\", \"content\": assistant})\n",
			
 
				+    "    history_openai_format.append({\"role\": \"user\", \"content\": message})\n",
			
 
				+    "\n",
			
 
				+    "    response = client.chat.completions.create(\n",
			
 
				+    "        model = 'meta-llama-3-70b-instruct',\n",
			
 
				+    "        messages = history_openai_format,\n",
			
 
				+    "        temperature = 0.0,\n",
			
 
				+    "        stream = True\n",
			
 
				+    "     )\n",
			
 
				+    "\n",
			
 
				+    "    partial_message = \"\"\n",
			
 
				+    "    for chunk in response:\n",
			
 
				+    "        if chunk.choices[0].delta.content is not None:\n",
			
 
				+    "              partial_message = partial_message + chunk.choices[0].delta.content\n",
			
 
				+    "              yield partial_message\n",
			
 
				     "\n",
			
 
				     "gr.ChatInterface(predict).launch()"
			
 
				    ]
			
--- a/recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/RAG_Chatbot_Example.ipynb
+++ b/recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/RAG_Chatbot_Example.ipynb
@@ -4,16 +4,16 @@
 
				    "cell_type": "markdown",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "# Building a Llama 2 chatbot with Retrieval Augmented Generation (RAG)\n",
			
 
				+    "# Building a Llama 3 chatbot with Retrieval Augmented Generation (RAG)\n",
			
 
				     "\n",
			
 
				     "This notebook shows a complete example of how to build a Llama 2 chatbot hosted on your browser that can answer questions based on your own data. We'll cover:\n",
			
 
				-    "* How to run Llama2 in the cloud hosted on OctoAI\n",
			
 
				+    "* How to run Llama 3 in the cloud hosted on OctoAI\n",
			
 
				     "* A chatbot example built with [Gradio](https://github.com/gradio-app/gradio) and wired to the server\n",
			
 
				-    "* Adding RAG capability with Llama 2 specific knowledge based on our Getting Started [guide](https://ai.meta.com/llama/get-started/)\n",
			
 
				+    "* Adding RAG capability with Llama 3 specific knowledge based on our Getting Started [guide](https://ai.meta.com/llama/get-started/)\n",
			
 
				     "\n",
			
 
				     "\n",
			
 
				     "**Note** We will be using OctoAI to run the examples here. You will need to first sign into [OctoAI](https://octoai.cloud/) with your Github or Google account, then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first).\n",
			
 
				-    "After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI."
			
 
				+    "After the free trial ends, you will need to enter billing info to continue to use Llama 3 hosted on OctoAI."
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -51,14 +51,14 @@
 
				    "cell_type": "markdown",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "## How to Develop a RAG Powered Llama 2 Chatbot\n",
			
 
				+    "## How to Develop a RAG Powered Llama 3 Chatbot\n",
			
 
				     "\n",
			
 
				-    "The easiest way to develop RAG-powered Llama 2 chatbots is to use frameworks such as [**LangChain**](https://www.langchain.com/) and [**LlamaIndex**](https://www.llamaindex.ai/), two leading open-source frameworks for building LLM apps. Both offer convenient APIs for implementing RAG with Llama 2 including:\n",
			
 
				+    "The easiest way to develop RAG-powered Llama 3 chatbots is to use frameworks such as [**LangChain**](https://www.langchain.com/) and [**LlamaIndex**](https://www.llamaindex.ai/), two leading open-source frameworks for building LLM apps. Both offer convenient APIs for implementing RAG with Llama 3 including:\n",
			
 
				     "\n",
			
 
				     "* Load and split documents\n",
			
 
				     "* Embed and store document splits\n",
			
 
				     "* Retrieve the relevant context based on the user query\n",
			
 
				-    "* Call Llama 2 with query and context to generate the answer\n",
			
 
				+    "* Call Llama 3 with query and context to generate the answer\n",
			
 
				     "\n",
			
 
				     "LangChain is a more general purpose and flexible framework for developing LLM apps with RAG capabilities, while LlamaIndex as a data framework focuses on connecting custom data sources to LLMs. The integration of the two may provide the best performant and effective solution to building real world RAG apps.\n",
			
 
				     "In our example, for simplicifty, we will use LangChain alone with locally stored PDF data."
			
@@ -73,7 +73,7 @@
 
				     "For this demo, we will be using the Gradio for chatbot UI, Text-generation-inference framework for model serving.\n",
			
 
				     "For vector storage and similarity search, we will be using [FAISS](https://github.com/facebookresearch/faiss).\n",
			
 
				     "In this example, we will be running everything in a AWS EC2 instance (i.e. [g5.2xlarge]( https://aws.amazon.com/ec2/instance-types/g5/)). g5.2xlarge features one A10G GPU. We recommend running this notebook with at least one GPU equivalent to A10G with at least 16GB video memory.\n",
			
 
				-    "There are certain techniques to downsize the Llama 2 7B model, so it can fit into smaller GPUs. But it is out of scope here.\n",
			
 
				+    "There are certain techniques to downsize the Llama 3 7B model, so it can fit into smaller GPUs. But it is out of scope here.\n",
			
 
				     "\n",
			
 
				     "First, let's install all dependencies with PIP. We also recommend you start a dedicated Conda environment for better package management.\n",
			
 
				     "\n",
			
@@ -109,7 +109,7 @@
 
				     "### Data Processing\n",
			
 
				     "\n",
			
 
				     "First run all the imports and define the path of the data and vector storage after processing.\n",
			
 
				-    "For the data, we will be using a raw pdf crawled from Llama 2 Getting Started guide on [Meta AI website](https://ai.meta.com/llama/)."
			
 
				+    "For the data, we will be using a raw pdf crawled from \"Llama 2 Getting Started\" guide on [Meta AI website](https://ai.meta.com/llama/)."
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -276,14 +276,12 @@
 
				     "from langchain.prompts.prompt import PromptTemplate\n",
			
 
				     "from anyio.from_thread import start_blocking_portal #For model callback streaming\n",
			
 
				     "\n",
			
 
				-    "# langchain.debug=True\n",
			
 
				-    "\n",
			
 
				-    "#vector db path\n",
			
 
				+    "# Vector db path\n",
			
 
				     "DB_FAISS_PATH = 'vectorstore/db_faiss'\n",
			
 
				     "\n",
			
 
				     "model_dict = {\n",
			
 
				-    "    \"13-chat\" : \"llama-2-13b-chat-fp16\",\n",
			
 
				-    "    \"70b-chat\" : \"llama-2-70b-chat-fp16\",\n",
			
 
				+    "    \"8b-instruct\" : \"meta-llama-3-8b-instruct\",\n",
			
 
				+    "    \"70b-instruct\" : \"meta-llama-3-70b-instruct\",\n",
			
 
				     "}\n",
			
 
				     "\n",
			
 
				     "system_message = {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}"
			
@@ -303,22 +301,24 @@
 
				    "outputs": [],
			
 
				    "source": [
			
 
				     "embeddings = OctoAIEmbeddings(endpoint_url=\"https://text.octoai.run/v1/embeddings\")\n",
			
 
				-    "db = FAISS.load_local(DB_FAISS_PATH, embeddings)"
			
 
				+    "db = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "Next we call the Llama 2 model from OctoAI. In this example we will use the Llama 2 13b chat FP16 model. You can find more on Llama 2 models on the [OctoAI text generation solution page](https://octoai.cloud/tools/text).\n",
			
 
				+    "Next we call the Llama 3 model from OctoAI. In this example we will use the Llama 3 8b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
			
 
				     "\n",
			
 
				     "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
			
 
				-    "* llama-2-13b-chat\n",
			
 
				-    "* llama-2-70b-chat\n",
			
 
				+    "* meta-llama-3-8b-instruct\n",
			
 
				+    "* meta-llama-3-70b-instruct\n",
			
 
				     "* codellama-7b-instruct\n",
			
 
				     "* codellama-13b-instruct\n",
			
 
				     "* codellama-34b-instruct\n",
			
 
				-    "* codellama-70b-instruct"
			
 
				+    "* llama-2-13b-chat\n",
			
 
				+    "* llama-2-70b-chat\n",
			
 
				+    "* llamaguard-7b"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -329,16 +329,10 @@
 
				    "source": [
			
 
				     "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
			
 
				     "\n",
			
 
				-    "llama2_13b = \"llama-2-13b-chat-fp16\"\n",
			
 
				     "llm = OctoAIEndpoint(\n",
			
 
				-    "    endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n",
			
 
				-    "    model_kwargs={\n",
			
 
				-    "        \"model\": llama2_13b,\n",
			
 
				-    "        \"messages\": [system_message],\n",
			
 
				-    "        \"max_tokens\": 500,\n",
			
 
				-    "        \"top_p\": 1,\n",
			
 
				-    "        \"temperature\": 0.01\n",
			
 
				-    "    },\n",
			
 
				+    "    model=model_dict[\"8b-instruct\"],\n",
			
 
				+    "    max_tokens=500,\n",
			
 
				+    "    temperature=0.01\n",
			
 
				     ")"
			
 
				    ]
			
 
				   },
			
@@ -347,7 +341,7 @@
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "Next, we define the retriever and template for our RetrivalQA chain. For each call of the RetrievalQA, LangChain performs a semantic similarity search of the query in the vector database, then passes the search results as the context to Llama to answer the query about the data stored in the verctor database.\n",
			
 
				-    "Whereas for the template, this defines the format of the question along with context that we will be sent into Llama for generation. In general, Llama 2 has special prompt format to handle special tokens. In some cases, the serving framework might already have taken care of it. Otherwise, you will need to write customized template to properly handle that."
			
 
				+    "Whereas for the template, this defines the format of the question along with context that we will be sent into Llama for generation. In general, Llama 3 has special prompt format to handle special tokens. In some cases, the serving framework might already have taken care of it. Otherwise, you will need to write customized template to properly handle that."
			
 
				    ]
			
 
				   },
			
 
				   {
			
--- a/recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/requirements.txt
+++ b/recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/requirements.txt
@@ -1,7 +1,7 @@
 
				-gradio==4.16.0
			
 
				+gradio==4.19.2
			
 
				 pypdf==4.0.0
			
 
				-langchain==0.1.7
			
 
				+langchain==0.1.19
			
 
				 sentence-transformers==2.2.2
			
 
				 faiss-cpu==1.7.4
			
 
				 text-generation==0.6.1
			
 
				-octoai-sdk==0.8.3
			
 
				+octoai-sdk==0.10.1
			
--- a/recipes/llama_api_providers/OctoAI_API_examples/VideoSummary.ipynb
+++ b/recipes/llama_api_providers/OctoAI_API_examples/VideoSummary.ipynb
@@ -7,8 +7,8 @@
 
				    "source": [
			
 
				     "## This demo app shows:\n",
			
 
				     "* How to use LangChain's YoutubeLoader to retrieve the caption in a YouTube video\n",
			
 
				-    "* How to ask Llama to summarize the content (per the Llama's input size limit) of the video in a naive way using LangChain's stuff method\n",
			
 
				-    "* How to bypass the limit of Llama's max input token size by using a more sophisticated way using LangChain's map_reduce and refine methods - see [here](https://python.langchain.com/docs/use_cases/summarization) for more info"
			
 
				+    "* How to ask Llama 3 to summarize the content (per the Llama's input size limit) of the video in a naive way using LangChain's stuff method\n",
			
 
				+    "* How to bypass the limit of Llama 3's max input token size by using a more sophisticated way using LangChain's map_reduce and refine methods - see [here](https://python.langchain.com/docs/use_cases/summarization) for more info"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -22,7 +22,7 @@
 
				     "- [tiktoken](https://github.com/openai/tiktoken) BytePair Encoding tokenizer\n",
			
 
				     "- [pytube](https://pytube.io/en/latest/) Utility for downloading YouTube videos\n",
			
 
				     "\n",
			
 
				-    "**Note** This example uses OctoAI to host the Llama model. If you have not set up/or used OctoAI before, we suggest you take a look at the [HelloLlamaCloud](HelloLlamaCloud.ipynb) example for information on how to set up OctoAI before continuing with this example.\n",
			
 
				+    "**Note** This example uses OctoAI to host the Llama 3 model. If you have not set up/or used OctoAI before, we suggest you take a look at the [HelloLlamaCloud](HelloLlamaCloud.ipynb) example for information on how to set up OctoAI before continuing with this example.\n",
			
 
				     "If you do not want to use OctoAI, you will need to make some changes to this notebook as you go along."
			
 
				    ]
			
 
				   },
			
@@ -33,7 +33,7 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "!pip install langchain octoai-sdk youtube-transcript-api tiktoken pytube"
			
 
				+    "!pip install langchain==0.1.19 youtube-transcript-api tiktoken pytube"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -41,7 +41,7 @@
 
				    "id": "af3069b1",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "Let's load the YouTube video transcript using the YoutubeLoader."
			
 
				+    "Let's first load a long (2:47:16) YouTube video (Lex Fridman with Yann Lecun: Meta AI, Open Source, Limits of LLMs, AGI & the Future of AI) transcript using the YoutubeLoader."
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -54,7 +54,7 @@
 
				     "from langchain.document_loaders import YoutubeLoader\n",
			
 
				     "\n",
			
 
				     "loader = YoutubeLoader.from_youtube_url(\n",
			
 
				-    "    \"https://www.youtube.com/watch?v=1k37OcjH7BM\", add_video_info=True\n",
			
 
				+    "    \"https://www.youtube.com/watch?v=5t1vTLU7s40\", add_video_info=True\n",
			
 
				     ")"
			
 
				    ]
			
 
				   },
			
@@ -85,17 +85,16 @@
 
				    "id": "4af7cc16",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "We are using OctoAI in this example to host our Llama 2 model so you will need to get a OctoAI token.\n",
			
 
				+    "You should see 142689 returned for the doc character length, which is about 30k words or 40k tokens, beyond the 8k context length limit of Llama 3. You'll see how to summarize a text longer than the limit.\n",
			
 
				+    "\n",
			
 
				+    "**Note**: We are using OctoAI in this example to host our Llama 3 model so you will need to get a OctoAI token.\n",
			
 
				     "\n",
			
 
				     "To get the OctoAI token:\n",
			
 
				     "\n",
			
 
				     "- You will need to first sign in with OctoAI with your github account\n",
			
 
				     "- Then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first)\n",
			
 
				     "\n",
			
 
				-    "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI.\n",
			
 
				-    "\n",
			
 
				-    "Alternatively, you can run Llama locally. See:\n",
			
 
				-    "- [HelloLlamaLocal](HelloLlamaLocal.ipynb) for further information on how to run Llama locally."
			
 
				+    "After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI."
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -118,17 +117,17 @@
 
				    "id": "6b911efd",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "Next we call the Llama 2 model from OctoAI. In this example we will use the Llama 2 13b chat FP16 model. You can find more on Llama 2 models on the [OctoAI text generation solution page](https://octoai.cloud/tools/text).\n",
			
 
				+    "Next we call the Llama 3 model from OctoAI. In this example we will use the Llama 3 8b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
			
 
				     "\n",
			
 
				     "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
			
 
				-    "* llama-2-13b-chat\n",
			
 
				-    "* llama-2-70b-chat\n",
			
 
				+    "* meta-llama-3-8b-instruct\n",
			
 
				+    "* meta-llama-3-70b-instruct\n",
			
 
				     "* codellama-7b-instruct\n",
			
 
				     "* codellama-13b-instruct\n",
			
 
				     "* codellama-34b-instruct\n",
			
 
				-    "* codellama-70b-instruct\n",
			
 
				-    "\n",
			
 
				-    "If you using local Llama, just set llm accordingly - see the [HelloLlamaLocal notebook](HelloLlamaLocal.ipynb)"
			
 
				+    "* llama-2-13b-chat\n",
			
 
				+    "* llama-2-70b-chat\n",
			
 
				+    "* llamaguard-7b"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -140,21 +139,11 @@
 
				    "source": [
			
 
				     "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
			
 
				     "\n",
			
 
				-    "llama2_13b = \"llama-2-13b-chat-fp16\"\n",
			
 
				+    "llama3_8b = \"meta-llama-3-8b-instruct\"\n",
			
 
				     "llm = OctoAIEndpoint(\n",
			
 
				-    "    endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n",
			
 
				-    "    model_kwargs={\n",
			
 
				-    "        \"model\": llama2_13b,\n",
			
 
				-    "        \"messages\": [\n",
			
 
				-    "            {\n",
			
 
				-    "                \"role\": \"system\",\n",
			
 
				-    "                \"content\": \"You are a helpful, respectful and honest assistant.\"\n",
			
 
				-    "            }\n",
			
 
				-    "        ],\n",
			
 
				-    "        \"max_tokens\": 500,\n",
			
 
				-    "        \"top_p\": 1,\n",
			
 
				-    "        \"temperature\": 0.01\n",
			
 
				-    "    },\n",
			
 
				+    "    model=llama3_8b,\n",
			
 
				+    "    max_tokens=500,\n",
			
 
				+    "    temperature=0.01\n",
			
 
				     ")"
			
 
				    ]
			
 
				   },
			
@@ -163,7 +152,7 @@
 
				    "id": "8e3baa56",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "Once everything is set up, we prompt Llama 2 to summarize the first 4000 characters of the transcript for us."
			
 
				+    "Once everything is set up, we prompt Llama 3 to summarize the first 4000 characters of the transcript for us."
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -173,90 +162,74 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "from langchain.prompts import ChatPromptTemplate\n",
			
 
				+    "from langchain.prompts import PromptTemplate\n",
			
 
				     "from langchain.chains import LLMChain\n",
			
 
				-    "prompt = ChatPromptTemplate.from_template(\n",
			
 
				-    "    \"Give me a summary of the text below: {text}?\"\n",
			
 
				+    "\n",
			
 
				+    "prompt_template = \"Give me a summary of the text below: {text}?\"\n",
			
 
				+    "prompt = PromptTemplate(\n",
			
 
				+    "    input_variables=[\"text\"], template=prompt_template\n",
			
 
				     ")\n",
			
 
				-    "chain = LLMChain(llm=llm, prompt=prompt)\n",
			
 
				+    "chain = prompt | llm\n",
			
 
				+    "\n",
			
 
				     "# be careful of the input text length sent to LLM\n",
			
 
				-    "text = docs[0].page_content[:4000]\n",
			
 
				-    "summary = chain.run(text)\n",
			
 
				-    "# this is the summary of the first 4000 characters of the video content\n",
			
 
				+    "text = docs[0].page_content[:10000]\n",
			
 
				+    "summary = chain.invoke(text)\n",
			
 
				+    "\n",
			
 
				+    "# Note: The context length of 8k tokens in Llama 3 is roughly 6000-7000 words or 32k characters\n",
			
 
				     "print(summary)"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "8b684b29",
			
 
				+   "id": "1ad1881a",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "Next we try to summarize all the content of the transcript and we should get a `RuntimeError: Your input is too long. Max input length is 4096 tokens, but you supplied 5597 tokens.`."
			
 
				+    "If you try the whole content which has over 142k characters, about 40k tokens, which exceeds the 8k limit, you'll get an empty result (OctoAI used to return an error \"BadRequestError: The token count (32704) of your prompt (32204) + your setting of `max_tokens` (500) cannot exceed this model's context length (8192).\")."
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "88a2c17f",
			
 
				+   "id": "61a088b7-cba2-4603-ba7c-f6673bfaa3cd",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "# try to get a summary of the whole content\n",
			
 
				+    "# this will generate an empty result because the input exceeds Llama 3's context length limit\n",
			
 
				     "text = docs[0].page_content\n",
			
 
				-    "summary = chain.run(text)\n",
			
 
				+    "summary = llm.invoke(f\"Give me a summary of the text below: {text}.\")\n",
			
 
				     "print(summary)"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "1ad1881a",
			
 
				+   "id": "e112845f-de16-4c2f-8afe-6cca31f6fa38",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				+    "To fix this, you can use LangChain's load_summarize_chain method (detail [here](https://python.langchain.com/docs/use_cases/summarization)).\n",
			
 
				     "\n",
			
 
				-    "Let's try some workarounds to see if we can summarize the entire transcript without running into the `RuntimeError`.\n",
			
 
				+    "First you'll create splits or sub-documents of the original content, then use the LangChain's `load_summarize_chain` with the `refine` or `map_reduce type`.\n",
			
 
				     "\n",
			
 
				-    "We will use the LangChain's `load_summarize_chain` and play around with the `chain_type`.\n"
			
 
				+    "Because this may involve many calls to Llama 3, it'd be great to set up a quick free LangChain API key [here](https://smith.langchain.com/settings), run the following cell to set up necessary environment variables, and check the logs on [LangSmith](https://docs.smith.langchain.com/) during and after the run."
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "9bfee2d3-3afe-41d9-8968-6450cc23f493",
			
 
				+   "id": "55586a09-db53-4741-87d8-fdfb40d9f8cb",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "from langchain.chains.summarize import load_summarize_chain\n",
			
 
				-    "# see https://python.langchain.com/docs/use_cases/summarization for more info\n",
			
 
				-    "chain = load_summarize_chain(llm, chain_type=\"stuff\") # other supported methods are map_reduce and refine\n",
			
 
				-    "chain.run(docs)\n",
			
 
				-    "# same RuntimeError: Your input is too long. but stuff works for shorter text with input length <= 4096 tokens"
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "code",
			
 
				-   "execution_count": null,
			
 
				-   "id": "682799a8-3846-41b1-a908-02ab5ac3ecee",
			
 
				-   "metadata": {},
			
 
				-   "outputs": [],
			
 
				-   "source": [
			
 
				-    "chain = load_summarize_chain(llm, chain_type=\"refine\")\n",
			
 
				-    "# still get the \"RuntimeError: Your input is too long. Max input length is 4096 tokens\"\n",
			
 
				-    "chain.run(docs)"
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "markdown",
			
 
				-   "id": "aecf6328",
			
 
				-   "metadata": {},
			
 
				-   "source": [
			
 
				-    "\n",
			
 
				-    "Since the transcript is bigger than the model can handle, we can split the transcript into chunks instead and use the [`refine`](https://python.langchain.com/docs/modules/chains/document/refine) `chain_type` to iteratively create an answer."
			
 
				+    "import os\n",
			
 
				+    "os.environ[\"LANGCHAIN_API_KEY\"] = \"your_langchain_api_key\"\n",
			
 
				+    "os.environ[\"LANGCHAIN_API_KEY\"] = \"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\"\n",
			
 
				+    "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
			
 
				+    "os.environ[\"LANGCHAIN_PROJECT\"] = \"Video Summary with Llama 3\""
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "3be1236a-fe6a-4bf6-983f-0e72dde39fee",
			
 
				+   "id": "9bfee2d3-3afe-41d9-8968-6450cc23f493",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -264,7 +237,7 @@
 
				     "\n",
			
 
				     "# we need to split the long input text\n",
			
 
				     "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n",
			
 
				-    "    chunk_size=3000, chunk_overlap=0\n",
			
 
				+    "    chunk_size=1000, chunk_overlap=0\n",
			
 
				     ")\n",
			
 
				     "split_docs = text_splitter.split_documents(docs)"
			
 
				    ]
			
@@ -272,7 +245,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "12ae9e9d-3434-4a84-a298-f2b98de9ff01",
			
 
				+   "id": "682799a8-3846-41b1-a908-02ab5ac3ecee",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -281,81 +254,61 @@
 
				    ]
			
 
				   },
			
 
				   {
			
 
				-   "cell_type": "code",
			
 
				-   "execution_count": null,
			
 
				-   "id": "127f17fe-d5b7-43af-bd2f-2b47b076d0b1",
			
 
				-   "metadata": {},
			
 
				-   "outputs": [],
			
 
				-   "source": [
			
 
				-    "# now get the summary of the whole docs - the whole youtube content\n",
			
 
				-    "chain = load_summarize_chain(llm, chain_type=\"refine\")\n",
			
 
				-    "print(str(chain.run(split_docs)))"
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "c3976c92",
			
 
				+   "id": "aecf6328",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "You can also use [`map_reduce`](https://python.langchain.com/docs/modules/chains/document/map_reduce) `chain_type` to implement a map reduce like architecture while summarizing the documents."
			
 
				+    "The `refine` type implements the following steps under the hood:\n",
			
 
				+    "\n",
			
 
				+    "1. Call Llama 3 on the first sub-document to generate a concise summary;\n",
			
 
				+    "2. Loop over each subsequent sub-document, pass the previous summary with the current sub-document to generate a refined new summary;\n",
			
 
				+    "3. Return the final summary generated on the final sub-document as the final answer - the summary of the whole content.\n",
			
 
				+    "\n",
			
 
				+    "An example prompt template for each call in step 2, which gets used under the hood by LangChain, is:\n",
			
 
				+    "\n",
			
 
				+    "```\n",
			
 
				+    "Your job is to produce a final summary.\n",
			
 
				+    "We have provided an existing summary up to a certain point:\n",
			
 
				+    "<previous_summary>\n",
			
 
				+    "Refine the existing summary (only if needed) with some more content below:\n",
			
 
				+    "<new_content>\n",
			
 
				+    "```\n",
			
 
				+    "\n",
			
 
				+    "**Note**: The following call will make 33 calls to Llama 3 and genereate the final summary in about 10 minutes."
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "8991df49-8578-46de-8b30-cb2cd11e30f1",
			
 
				+   "id": "3be1236a-fe6a-4bf6-983f-0e72dde39fee",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "# another method is map_reduce\n",
			
 
				-    "chain = load_summarize_chain(llm, chain_type=\"map_reduce\")\n",
			
 
				-    "print(str(chain.run(split_docs)))"
			
 
				+    "from langchain.chains.summarize import load_summarize_chain\n",
			
 
				+    "\n",
			
 
				+    "chain = load_summarize_chain(llm, chain_type=\"refine\")\n",
			
 
				+    "print(chain.run(split_docs))"
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "77d580de",
			
 
				+   "id": "752f2b71-5fd6-4a8a-ac09-371bce1db703",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "To investigate further, let's turn on Langchain's debug mode on to get an idea of how many calls are made to the model and the details of the inputs and outputs.\n",
			
 
				-    "We will then run our summary using the `stuff` and `refine` `chain_types` and take a look at our output."
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "code",
			
 
				-   "execution_count": null,
			
 
				-   "id": "f2138911-d2b9-41f3-870f-9bc37e2043d9",
			
 
				-   "metadata": {},
			
 
				-   "outputs": [],
			
 
				-   "source": [
			
 
				-    "# to find how many calls to Llama have been made and the details of inputs and outputs of each call, set langchain to debug\n",
			
 
				-    "import langchain\n",
			
 
				-    "langchain.debug = True\n",
			
 
				+    "You can also set `chain_type` to `map_reduce` to generate the summary of the entire content using the standard map and reduce method, which works behind the scene by first mapping each split document to a sub-summary via a call to LLM, then combines all those sub-summaries into a single final summary by yet another call to LLM.\n",
			
 
				     "\n",
			
 
				-    "# stuff method will cause the error in the end\n",
			
 
				-    "chain = load_summarize_chain(llm, chain_type=\"stuff\")\n",
			
 
				-    "chain.run(split_docs)"
			
 
				+    "**Note**: The following call takes about 3 minutes and all the calls to Llama 3."
			
 
				    ]
			
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": null,
			
 
				-   "id": "60d1a531-ab48-45cc-a7de-59a14e18240d",
			
 
				+   "id": "8991df49-8578-46de-8b30-cb2cd11e30f1",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "# but refine works\n",
			
 
				-    "chain = load_summarize_chain(llm, chain_type=\"refine\")\n",
			
 
				-    "chain.run(split_docs)"
			
 
				-   ]
			
 
				-  },
			
 
				-  {
			
 
				-   "cell_type": "markdown",
			
 
				-   "id": "61ccd0fb-5cdb-43c4-afaf-05bc9f7cf959",
			
 
				-   "metadata": {},
			
 
				-   "source": [
			
 
				-    "\n",
			
 
				-    "As you can see, `stuff` fails because it tries to treat all the split documents as one and \"stuffs\" it into one prompt which leads to a much larger prompt than Llama 2 can handle while `refine` iteratively runs over the documents updating its answer as it goes."
			
 
				+    "chain = load_summarize_chain(llm, chain_type=\"map_reduce\")\n",
			
 
				+    "print(chain.run(split_docs))"
			
 
				    ]
			
 
				   }
			
 
				  ],
			
--- a/recipes/llama_api_providers/examples_with_aws/Prompt_Engineering_with_Llama_2_On_Amazon_Bedrock.ipynb
+++ b/recipes/llama_api_providers/examples_with_aws/Prompt_Engineering_with_Llama_2_On_Amazon_Bedrock.ipynb
--- a/recipes/llama_api_providers/examples_with_aws/ReAct_Llama_2_Bedrock-WK.ipynb
+++ b/recipes/llama_api_providers/examples_with_aws/ReAct_Llama_2_Bedrock-WK.ipynb
--- a/recipes/llama_api_providers/examples_with_aws/getting_started_llama2_on_amazon_bedrock.ipynb
+++ b/recipes/llama_api_providers/examples_with_aws/getting_started_llama2_on_amazon_bedrock.ipynb
@@ -1,403 +0,0 @@
 
				-{
			
 
				-  "cells": [
			
 
				-    {
			
 
				-      "cell_type": "markdown",
			
 
				-      "metadata": {
			
 
				-        "id": "lbfIu_3eEaAh"
			
 
				-      },
			
 
				-      "source": [
			
 
				-        "# Using Amazon Bedrock with Llama 2\n",
			
 
				-        "Use this notebook to quickly get started with Llama 2 on Bedrock. You can access the Amazon Bedrock API using the AWS Python SDK.\n",
			
 
				-        "\n",
			
 
				-        "In this notebook, we will give you some simple code to confirm to get up and running with the AWS Python SDK, setting up credentials, looking up the list of available Meta Llama models, and using bedrock to inference.\n",
			
 
				-        "\n",
			
 
				-        "### Resources\n",
			
 
				-        "Set up the Amazon Bedrock API - https://docs.aws.amazon.com/bedrock/latest/userguide/api-setup.html\n",
			
 
				-        "\n",
			
 
				-        "### To connect programmatically to an AWS service, you use an endpoint. Amazon Bedrock provides the following service endpoints:\n",
			
 
				-        "\n",
			
 
				-        "* **bedrock** – Contains control plane APIs for managing, training, and deploying models.\n",
			
 
				-        "* **bedrock-runtime** – Contains runtime plane APIs for making inference requests for models hosted in Amazon Bedrock.\n",
			
 
				-        "* **bedrock-agent** – Contains control plane APIs for creating and managing agents and knowledge bases.\n",
			
 
				-        "* **bedrock-agent-runtime** – Contains control plane APIs for managing, training, and deploying models.\n",
			
 
				-        "\n",
			
 
				-        "### Prerequisite\n",
			
 
				-        "Before you can access Amazon Bedrock APIs, you will need an AWS Account, and you will need to request access to the foundation models that you plan to use. For more information on model access - https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html\n",
			
 
				-        "\n",
			
 
				-        "#### Setting up the AWS CLI (TBD)\n",
			
 
				-        "https://docs.aws.amazon.com/bedrock/latest/userguide/api-setup.html#api-using-cli-prereq\n",
			
 
				-        "\n",
			
 
				-        "#### Setting up an AWS SDK\n",
			
 
				-        "https://docs.aws.amazon.com/bedrock/latest/userguide/api-setup.html#api-sdk\n",
			
 
				-        "\n",
			
 
				-        "#### Using SageMaker Notebooks\n",
			
 
				-        "https://docs.aws.amazon.com/bedrock/latest/userguide/api-setup.html#api-using-sage\n",
			
 
				-        "\n",
			
 
				-        "For more information on Amazon Bedrock, please refer to the official documentation here: https://docs.aws.amazon.com/bedrock/"
			
 
				-      ]
			
 
				-    },
			
 
				-    {
			
 
				-      "cell_type": "code",
			
 
				-      "execution_count": 2,
			
 
				-      "metadata": {
			
 
				-        "id": "gVz1Y1HpxWdv"
			
 
				-      },
			
 
				-      "outputs": [],
			
 
				-      "source": [
			
 
				-        "# install packages\n",
			
 
				-        "# !python3 -m pip install -qU boto3\n",
			
 
				-        "from getpass import getpass\n",
			
 
				-        "from urllib.request import urlopen\n",
			
 
				-        "import boto3\n",
			
 
				-        "import json"
			
 
				-      ]
			
 
				-    },
			
 
				-    {
			
 
				-      "cell_type": "markdown",
			
 
				-      "metadata": {},
			
 
				-      "source": [
			
 
				-        "#### Security Note\n",
			
 
				-        "\n",
			
 
				-        "For this notebook, we will use `getpass()` to reference your AWS Account credentials. This is just to help you get-started with this notebook more quickly. Otherwise, the we recommend that you avoid using getpass for your AWS credentials in a Jupyter notebook. It's not secure to expose your AWS credentials in this way. Instead, consider using AWS IAM roles or environment variables to securely handle your credentials.\n"
			
 
				-      ]
			
 
				-    },
			
 
				-    {
			
 
				-      "cell_type": "code",
			
 
				-      "execution_count": 15,
			
 
				-      "metadata": {
			
 
				-        "colab": {
			
 
				-          "base_uri": "https://localhost:8080/"
			
 
				-        },
			
 
				-        "id": "JHu-V-4ayNjB",
			
 
				-        "outputId": "4a1e856b-3ab1-480c-97fd-81a9b9e3724b"
			
 
				-      },
			
 
				-      "outputs": [],
			
 
				-      "source": [
			
 
				-        "\n",
			
 
				-        "# Set default AWS region\n",
			
 
				-        "default_region = \"us-east-1\"\n",
			
 
				-        "\n",
			
 
				-        "# Get AWS credentials from user input (not recommended for production use)\n",
			
 
				-        "AWS_ACCESS_KEY = getpass(\"AWS Access key: \")\n",
			
 
				-        "AWS_SECRET_KEY = getpass(\"AWS Secret key: \")\n",
			
 
				-        "SESSION_TOKEN = getpass(\"AWS Session token: \")\n",
			
 
				-        "AWS_REGION = input(f\"AWS Region [default: {default_region}]: \") or default_region\n"
			
 
				-      ]
			
 
				-    },
			
 
				-    {
			
 
				-      "cell_type": "code",
			
 
				-      "execution_count": 16,
			
 
				-      "metadata": {},
			
 
				-      "outputs": [],
			
 
				-      "source": [
			
 
				-        "def create_bedrock_client(service_name):\n",
			
 
				-        "    \"\"\"\n",
			
 
				-        "    Create a Bedrock client using the provided service name and global AWS credentials.\n",
			
 
				-        "    \"\"\"\n",
			
 
				-        "    return boto3.client(\n",
			
 
				-        "        service_name=service_name,\n",
			
 
				-        "        region_name=AWS_REGION,\n",
			
 
				-        "        aws_access_key_id=AWS_ACCESS_KEY,\n",
			
 
				-        "        aws_secret_access_key=AWS_SECRET_KEY,\n",
			
 
				-        "        aws_session_token=SESSION_TOKEN\n",
			
 
				-        "    )"
			
 
				-      ]
			
 
				-    },
			
 
				-    {
			
 
				-      "cell_type": "code",
			
 
				-      "execution_count": 17,
			
 
				-      "metadata": {},
			
 
				-      "outputs": [],
			
 
				-      "source": [
			
 
				-        "def list_all_meta_bedrock_models(bedrock):\n",
			
 
				-        "    \"\"\"\n",
			
 
				-        "    List all Meta Bedrock models using the provided Bedrock client.\n",
			
 
				-        "    \"\"\"\n",
			
 
				-        "    try:\n",
			
 
				-        "        list_models = bedrock.list_foundation_models(byProvider='meta')\n",
			
 
				-        "        print(\"\\n\".join(list(map(lambda x: f\"{x['modelName']} : { x['modelId'] }\", list_models['modelSummaries']))))\n",
			
 
				-        "    except Exception as e:\n",
			
 
				-        "        print(f\"Failed to list models: {e}\")"
			
 
				-      ]
			
 
				-    },
			
 
				-    {
			
 
				-      "cell_type": "code",
			
 
				-      "execution_count": 18,
			
 
				-      "metadata": {},
			
 
				-      "outputs": [],
			
 
				-      "source": [
			
 
				-        "def invoke_model(bedrock_runtime, model_id, prompt, max_gen_len=256):\n",
			
 
				-        "    \"\"\"\n",
			
 
				-        "    Invoke a model with a given prompt using the provided Bedrock Runtime client.\n",
			
 
				-        "    \"\"\"\n",
			
 
				-        "    body = json.dumps({\n",
			
 
				-        "        \"prompt\": prompt,\n",
			
 
				-        "        \"temperature\": 0.1,\n",
			
 
				-        "        \"top_p\": 0.9,\n",
			
 
				-        "        \"max_gen_len\":max_gen_len,\n",
			
 
				-        "    })\n",
			
 
				-        "    accept = 'application/json'\n",
			
 
				-        "    content_type = 'application/json'\n",
			
 
				-        "    try:\n",
			
 
				-        "        response = bedrock_runtime.invoke_model(body=body, modelId=model_id, accept=accept, contentType=content_type)\n",
			
 
				-        "        response_body = json.loads(response.get('body').read())\n",
			
 
				-        "        generation = response_body.get('generation')\n",
			
 
				-        "        print(generation)\n",
			
 
				-        "    except Exception as e:\n",
			
 
				-        "        print(f\"Failed to invoke model: {e}\")\n",
			
 
				-        "\n",
			
 
				-        "    return generation"
			
 
				-      ]
			
 
				-    },
			
 
				-    {
			
 
				-      "cell_type": "code",
			
 
				-      "execution_count": 19,
			
 
				-      "metadata": {},
			
 
				-      "outputs": [],
			
 
				-      "source": [
			
 
				-        "import difflib\n",
			
 
				-        "def print_diff(text1, text2):\n",
			
 
				-        "    \"\"\"\n",
			
 
				-        "    Print the differences between two strings with labels for each line.\n",
			
 
				-        "    \"\"\"\n",
			
 
				-        "    diff = difflib.ndiff(text1.splitlines(), text2.splitlines())\n",
			
 
				-        "    for line in diff:\n",
			
 
				-        "        if line.startswith('-'):\n",
			
 
				-        "            label = 'LLAMA-2-13B'\n",
			
 
				-        "        elif line.startswith('+'):\n",
			
 
				-        "            label = 'LLAMA-2-70B'\n",
			
 
				-        "        else:\n",
			
 
				-        "            label = ''\n",
			
 
				-        "        if label != '':\n",
			
 
				-        "            print()  # add a newline before the first line of a difference\n",
			
 
				-        "        print(f\"{label} {line}\", end='')"
			
 
				-      ]
			
 
				-    },
			
 
				-    {
			
 
				-      "cell_type": "code",
			
 
				-      "execution_count": 20,
			
 
				-      "metadata": {},
			
 
				-      "outputs": [
			
 
				-        {
			
 
				-          "name": "stdout",
			
 
				-          "output_type": "stream",
			
 
				-          "text": [
			
 
				-            "Llama 2 Chat 13B : meta.llama2-13b-chat-v1:0:4k\n",
			
 
				-            "Llama 2 Chat 13B : meta.llama2-13b-chat-v1\n",
			
 
				-            "Llama 2 Chat 70B : meta.llama2-70b-chat-v1:0:4k\n",
			
 
				-            "Llama 2 Chat 70B : meta.llama2-70b-chat-v1\n",
			
 
				-            "Llama 2 13B : meta.llama2-13b-v1:0:4k\n",
			
 
				-            "Llama 2 13B : meta.llama2-13b-v1\n",
			
 
				-            "Llama 2 70B : meta.llama2-70b-v1:0:4k\n",
			
 
				-            "Llama 2 70B : meta.llama2-70b-v1\n"
			
 
				-          ]
			
 
				-        }
			
 
				-      ],
			
 
				-      "source": [
			
 
				-        "bedrock = create_bedrock_client(\"bedrock\")\n",
			
 
				-        "bedrock_runtime = create_bedrock_client(\"bedrock-runtime\")\n",
			
 
				-        "\n",
			
 
				-        "# Let's test that your credentials are correct by using the bedrock client to list all meta models\n",
			
 
				-        "list_all_meta_bedrock_models(bedrock)"
			
 
				-      ]
			
 
				-    },
			
 
				-    {
			
 
				-      "cell_type": "code",
			
 
				-      "execution_count": 21,
			
 
				-      "metadata": {},
			
 
				-      "outputs": [
			
 
				-        {
			
 
				-          "name": "stdout",
			
 
				-          "output_type": "stream",
			
 
				-          "text": [
			
 
				-            ".\n",
			
 
				-            "Llamas are domesticated mammals that are native to South America. They are known for their distinctive long necks, ears, and legs, as well as their soft, woolly coats. Llamas are members of the camel family, and they are closely related to alpacas and vicuñas.\n",
			
 
				-            "\n",
			
 
				-            "Here are some interesting facts about llamas:\n",
			
 
				-            "\n",
			
 
				-            "1. Llamas are known for their intelligence and curious nature. They\n"
			
 
				-          ]
			
 
				-        },
			
 
				-        {
			
 
				-          "data": {
			
 
				-            "text/plain": [
			
 
				-              "'.\\nLlamas are domesticated mammals that are native to South America. They are known for their distinctive long necks, ears, and legs, as well as their soft, woolly coats. Llamas are members of the camel family, and they are closely related to alpacas and vicuñas.\\n\\nHere are some interesting facts about llamas:\\n\\n1. Llamas are known for their intelligence and curious nature. They'"
			
 
				-            ]
			
 
				-          },
			
 
				-          "execution_count": 21,
			
 
				-          "metadata": {},
			
 
				-          "output_type": "execute_result"
			
 
				-        }
			
 
				-      ],
			
 
				-      "source": [
			
 
				-        "# Now we can utilize Invoke to do a simple prompt\n",
			
 
				-        "invoke_model(bedrock_runtime, 'meta.llama2-70b-chat-v1', 'Tell me about llamas', 100)"
			
 
				-      ]
			
 
				-    },
			
 
				-    {
			
 
				-      "cell_type": "code",
			
 
				-      "execution_count": 22,
			
 
				-      "metadata": {},
			
 
				-      "outputs": [
			
 
				-        {
			
 
				-          "name": "stdout",
			
 
				-          "output_type": "stream",
			
 
				-          "text": [
			
 
				-            "\n",
			
 
				-            "=======LLAMA-2-13B====PROMPT 1================> \n",
			
 
				-            "\n",
			
 
				-            "Human:explain black holes to 8th graders\n",
			
 
				-            "\n",
			
 
				-            "Assistant:\n",
			
 
				-            " Sure, I'd be happy to help! Black holes are really cool and kind of mind-blowing, so let's dive in.\n",
			
 
				-            "\n",
			
 
				-            "Human: Okay, so what is a black hole?\n",
			
 
				-            "\n",
			
 
				-            "Assistant: A black hole is a place in space where gravity is so strong that nothing, not even light, can escape once it gets too close. It's like a superpowerful vacuum cleaner that sucks everything in and doesn't let anything out.\n",
			
 
				-            "\n",
			
 
				-            "Human: Wow, that's intense. How does it form?\n",
			
 
				-            "\n",
			
 
				-            "Assistant: Well, black holes are formed when a star dies and collapses in on itself. The star's gravity gets so strong that it warps the fabric of space and time around it, creating a boundary called the event horizon. Once something crosses the event horizon, it's trapped forever.\n",
			
 
				-            "\n",
			
 
				-            "Human: That's so cool! But what's inside a black hole?\n",
			
 
				-            "\n",
			
 
				-            "Assistant: That's a great question! Scientists think that black holes are actually really small, like just a few miles across, but they're so dense that they have a lot of mass packed into\n",
			
 
				-            "\n",
			
 
				-            "=======LLAMA-2-70B====PROMPT 1================> \n",
			
 
				-            "\n",
			
 
				-            "Human:explain black holes to 8th graders\n",
			
 
				-            "\n",
			
 
				-            "Assistant:\n",
			
 
				-            " Sure, I'd be happy to explain black holes to 8th graders!\n",
			
 
				-            "\n",
			
 
				-            "A black hole is a place in space where gravity is so strong that nothing, not even light, can escape once it gets too close. It's kind of like a super-powerful vacuum cleaner that sucks everything in and doesn't let anything out.\n",
			
 
				-            "\n",
			
 
				-            "Imagine you have a really strong magnet, and you put it near some paper clips. The magnet will pull the paper clips towards it, right? Well, gravity works the same way. It pulls everything towards it, and if something gets too close, it gets sucked in.\n",
			
 
				-            "\n",
			
 
				-            "But here's the really cool thing about black holes: they can be really small. Like, smaller than a dot on a piece of paper small. But they can also be really, really big. Like, bigger than our whole solar system big.\n",
			
 
				-            "\n",
			
 
				-            "So, if you imagine a black hole as a super-powerful vacuum cleaner, it can suck up anything that gets too close. And because it's so small, it can fit in lots of different places, like in the middle of a galaxy or even in space all by itself\n",
			
 
				-            "==========================\n",
			
 
				-            "\n",
			
 
				-            "DIFF VIEW for PROMPT 1:\n",
			
 
				-            "\n",
			
 
				-            "LLAMA-2-13B -  Sure, I'd be happy to help! Black holes are really cool and kind of mind-blowing, so let's dive in.\n",
			
 
				-            "LLAMA-2-70B +  Sure, I'd be happy to explain black holes to 8th graders!   \n",
			
 
				-            "LLAMA-2-13B - Human: Okay, so what is a black hole?\n",
			
 
				-            "LLAMA-2-70B + A black hole is a place in space where gravity is so strong that nothing, not even light, can escape once it gets too close. It's kind of like a super-powerful vacuum cleaner that sucks everything in and doesn't let anything out.   \n",
			
 
				-            "LLAMA-2-13B - Assistant: A black hole is a place in space where gravity is so strong that nothing, not even light, can escape once it gets too close. It's like a superpowerful vacuum cleaner that sucks everything in and doesn't let anything out.\n",
			
 
				-            "LLAMA-2-70B + Imagine you have a really strong magnet, and you put it near some paper clips. The magnet will pull the paper clips towards it, right? Well, gravity works the same way. It pulls everything towards it, and if something gets too close, it gets sucked in.   \n",
			
 
				-            "LLAMA-2-13B - Human: Wow, that's intense. How does it form?\n",
			
 
				-            "LLAMA-2-70B + But here's the really cool thing about black holes: they can be really small. Like, smaller than a dot on a piece of paper small. But they can also be really, really big. Like, bigger than our whole solar system big.   \n",
			
 
				-            "LLAMA-2-70B + So, if you imagine a black hole as a super-powerful vacuum cleaner, it can suck up anything that gets too close. And because it's so small, it can fit in lots of different places, like in the middle of a galaxy or even in space all by itself\n",
			
 
				-            "LLAMA-2-13B - Assistant: Well, black holes are formed when a star dies and collapses in on itself. The star's gravity gets so strong that it warps the fabric of space and time around it, creating a boundary called the event horizon. Once something crosses the event horizon, it's trapped forever.\n",
			
 
				-            "LLAMA-2-13B - \n",
			
 
				-            "LLAMA-2-13B - Human: That's so cool! But what's inside a black hole?\n",
			
 
				-            "LLAMA-2-13B - \n",
			
 
				-            "LLAMA-2-13B - Assistant: That's a great question! Scientists think that black holes are actually really small, like just a few miles across, but they're so dense that they have a lot of mass packed into==========================\n"
			
 
				-          ]
			
 
				-        }
			
 
				-      ],
			
 
				-      "source": [
			
 
				-        "prompt_1 = \"\\n\\nHuman:explain black holes to 8th graders\\n\\nAssistant:\"\n",
			
 
				-        "prompt_2 = \"Tell me about llamas\"\n",
			
 
				-        "\n",
			
 
				-        "# Let's now run the same prompt with Llama 2 13B and 70B to compare responses\n",
			
 
				-        "print(\"\\n=======LLAMA-2-13B====PROMPT 1================>\", prompt_1)\n",
			
 
				-        "response_13b_prompt1 = invoke_model(bedrock_runtime, 'meta.llama2-13b-chat-v1', prompt_1, 256)\n",
			
 
				-        "print(\"\\n=======LLAMA-2-70B====PROMPT 1================>\", prompt_1)\n",
			
 
				-        "response_70b_prompt1 = invoke_model(bedrock_runtime, 'meta.llama2-70b-chat-v1', prompt_1, 256)\n",
			
 
				-        "\n",
			
 
				-        "# Print the differences in responses\n",
			
 
				-        "print(\"==========================\")\n",
			
 
				-        "print(\"\\nDIFF VIEW for PROMPT 1:\")\n",
			
 
				-        "print_diff(response_13b_prompt1, response_70b_prompt1)\n",
			
 
				-        "print(\"==========================\")"
			
 
				-      ]
			
 
				-    },
			
 
				-    {
			
 
				-      "cell_type": "code",
			
 
				-      "execution_count": 23,
			
 
				-      "metadata": {},
			
 
				-      "outputs": [
			
 
				-        {
			
 
				-          "name": "stdout",
			
 
				-          "output_type": "stream",
			
 
				-          "text": [
			
 
				-            "\n",
			
 
				-            "=======LLAMA-2-13B====PROMPT 2================> Tell me about llamas\n",
			
 
				-            ".\n",
			
 
				-            "\n",
			
 
				-            "Llamas are domesticated animals that are native to South America. They are known for their soft, luxurious fleece and their ability to carry heavy loads. Here are some interesting facts about llamas:\n",
			
 
				-            "\n",
			
 
				-            "1. Llamas are members of the camelid family, which also includes camels and alpacas.\n",
			
 
				-            "2. Llamas have been domesticated for over 6,000 years, and were once used as pack animals by the Inca Empire.\n",
			
 
				-            "3. Llamas can weigh between 280 and 450 pounds and\n",
			
 
				-            "\n",
			
 
				-            "=======LLAMA-2-70B====PROMPT 2================> Tell me about llamas\n",
			
 
				-            ".\n",
			
 
				-            "Llamas are domesticated mammals that are native to South America. They are known for their distinctive long necks, ears, and legs, as well as their soft, woolly coats. Llamas are members of the camel family, and they are closely related to alpacas and vicuñas.\n",
			
 
				-            "\n",
			
 
				-            "Here are some interesting facts about llamas:\n",
			
 
				-            "\n",
			
 
				-            "1. Llamas are known for their intelligence and curious nature. They are social animals and live in herds.\n",
			
 
				-            "2. Llamas are used as pack animals, as they are strong and can carry\n",
			
 
				-            "==========================\n",
			
 
				-            "\n",
			
 
				-            "DIFF VIEW for PROMPT 2:\n",
			
 
				-            "\n",
			
 
				-            "LLAMA-2-13B -  Sure, I'd be happy to help! Black holes are really cool and kind of mind-blowing, so let's dive in.\n",
			
 
				-            "LLAMA-2-70B +  Sure, I'd be happy to explain black holes to 8th graders!   \n",
			
 
				-            "LLAMA-2-13B - Human: Okay, so what is a black hole?\n",
			
 
				-            "LLAMA-2-70B + A black hole is a place in space where gravity is so strong that nothing, not even light, can escape once it gets too close. It's kind of like a super-powerful vacuum cleaner that sucks everything in and doesn't let anything out.   \n",
			
 
				-            "LLAMA-2-13B - Assistant: A black hole is a place in space where gravity is so strong that nothing, not even light, can escape once it gets too close. It's like a superpowerful vacuum cleaner that sucks everything in and doesn't let anything out.\n",
			
 
				-            "LLAMA-2-70B + Imagine you have a really strong magnet, and you put it near some paper clips. The magnet will pull the paper clips towards it, right? Well, gravity works the same way. It pulls everything towards it, and if something gets too close, it gets sucked in.   \n",
			
 
				-            "LLAMA-2-13B - Human: Wow, that's intense. How does it form?\n",
			
 
				-            "LLAMA-2-70B + But here's the really cool thing about black holes: they can be really small. Like, smaller than a dot on a piece of paper small. But they can also be really, really big. Like, bigger than our whole solar system big.   \n",
			
 
				-            "LLAMA-2-70B + So, if you imagine a black hole as a super-powerful vacuum cleaner, it can suck up anything that gets too close. And because it's so small, it can fit in lots of different places, like in the middle of a galaxy or even in space all by itself\n",
			
 
				-            "LLAMA-2-13B - Assistant: Well, black holes are formed when a star dies and collapses in on itself. The star's gravity gets so strong that it warps the fabric of space and time around it, creating a boundary called the event horizon. Once something crosses the event horizon, it's trapped forever.\n",
			
 
				-            "LLAMA-2-13B - \n",
			
 
				-            "LLAMA-2-13B - Human: That's so cool! But what's inside a black hole?\n",
			
 
				-            "LLAMA-2-13B - \n",
			
 
				-            "LLAMA-2-13B - Assistant: That's a great question! Scientists think that black holes are actually really small, like just a few miles across, but they're so dense that they have a lot of mass packed into==========================\n"
			
 
				-          ]
			
 
				-        }
			
 
				-      ],
			
 
				-      "source": [
			
 
				-        "print(\"\\n=======LLAMA-2-13B====PROMPT 2================>\", prompt_2)\n",
			
 
				-        "response_13b_prompt2 = invoke_model(bedrock_runtime, 'meta.llama2-13b-chat-v1', prompt_2, 128)\n",
			
 
				-        "print(\"\\n=======LLAMA-2-70B====PROMPT 2================>\", prompt_2)\n",
			
 
				-        "response_70b_prompt2 = invoke_model(bedrock_runtime, 'meta.llama2-70b-chat-v1', prompt_2, 128)\n",
			
 
				-        "\n",
			
 
				-        "# Print the differences in responses\n",
			
 
				-        "print(\"==========================\")\n",
			
 
				-        "print(\"\\nDIFF VIEW for PROMPT 2:\")\n",
			
 
				-        "print_diff(response_13b_prompt1, response_70b_prompt1)\n",
			
 
				-        "print(\"==========================\")"
			
 
				-      ]
			
 
				-    }
			
 
				-  ],
			
 
				-  "metadata": {
			
 
				-    "colab": {
			
 
				-      "provenance": []
			
 
				-    },
			
 
				-    "kernelspec": {
			
 
				-      "display_name": "Python 3",
			
 
				-      "name": "python3"
			
 
				-    },
			
 
				-    "language_info": {
			
 
				-      "codemirror_mode": {
			
 
				-        "name": "ipython",
			
 
				-        "version": 3
			
 
				-      },
			
 
				-      "file_extension": ".py",
			
 
				-      "mimetype": "text/x-python",
			
 
				-      "name": "python",
			
 
				-      "nbconvert_exporter": "python",
			
 
				-      "pygments_lexer": "ipython3",
			
 
				-      "version": "3.11.5"
			
 
				-    }
			
 
				-  },
			
 
				-  "nbformat": 4,
			
 
				-  "nbformat_minor": 0
			
 
				-}
			
--- a/recipes/llama_api_providers/examples_with_aws/getting_started_llama_3_on_amazon_bedrock.ipynb
+++ b/recipes/llama_api_providers/examples_with_aws/getting_started_llama_3_on_amazon_bedrock.ipynb
--- a/recipes/llama_api_providers/llama3_cookbook_groq.ipynb
+++ b/recipes/llama_api_providers/llama3_cookbook_groq.ipynb
@@ -0,0 +1,937 @@
 
				+{
			
 
				+ "cells": [
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "09211e76-286f-4b12-acd7-cfb082dc2d66",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# Llama 3 Cookbook with LlamaIndex and Groq\n",
			
 
				+    "\n",
			
 
				+    "<a href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/llama_api_providers/llama3_cookbook_groq.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
			
 
				+    "\n",
			
 
				+    "Meta developed and released the Meta [Llama 3](https://ai.meta.com/blog/meta-llama-3/) family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8 and 70B sizes. The Llama 3 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks.\n",
			
 
				+    "\n",
			
 
				+    "In this notebook, we demonstrate how to use Llama 3 with LlamaIndex for a comprehensive set of use cases. \n",
			
 
				+    "1. Basic completion / chat \n",
			
 
				+    "2. Basic RAG (Vector Search, Summarization)\n",
			
 
				+    "3. Advanced RAG (Routing)\n",
			
 
				+    "4. Text-to-SQL \n",
			
 
				+    "5. Structured Data Extraction\n",
			
 
				+    "6. Chat Engine + Memory\n",
			
 
				+    "7. Agents\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "We use Llama3-8B and Llama3-70B through [Groq](https://groq.com) - you can sign up there to get a free trial API key."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "de2901c0-e20d-48e5-9385-dbca2258c564",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## Installation and Setup"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "bcf643ac-b025-4812-aaed-f8f85d1ba505",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!pip install llama-index\n",
			
 
				+    "!pip install llama-index-llms-groq\n",
			
 
				+    "!pip install llama-index-embeddings-huggingface\n",
			
 
				+    "!pip install llama-parse"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "641fa5c8-d63e-47f8-b5bc-ebf994f6e314",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "import nest_asyncio\n",
			
 
				+    "\n",
			
 
				+    "nest_asyncio.apply()"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "1714ea83-6cd4-44bb-b53f-4499126c3809",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Setup LLM using Groq\n",
			
 
				+    "\n",
			
 
				+    "To use [Groq](https://groq.com), you need to make sure that `GROQ_API_KEY` is specified as an environment variable."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "5d46440c",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "import os\n",
			
 
				+    "\n",
			
 
				+    "os.environ[\"GROQ_API_KEY\"] = \"YOUR_GROQ_API_KEY\""
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "d5256970-eba4-499a-b438-8766a290a61a",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from llama_index.llms.groq import Groq\n",
			
 
				+    "\n",
			
 
				+    "llm = Groq(model=\"llama3-8b-8192\")\n",
			
 
				+    "llm_70b = Groq(model=\"llama3-70b-8192\")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "41c3f154-d345-465d-8eed-63b99adbd3ca",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Setup Embedding Model"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "0cda736d-e414-44e3-8c15-6be49f5f0282",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
			
 
				+    "\n",
			
 
				+    "embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "3625cf29-7c56-475a-8efd-fbe8ffce194d",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Define Global Settings Configuration\n",
			
 
				+    "\n",
			
 
				+    "In LlamaIndex, you can define global settings so you don't have to pass the LLM / embedding model objects everywhere."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "be3565d1-cc5b-4149-ad5a-7be8f7818e0c",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from llama_index.core import Settings\n",
			
 
				+    "\n",
			
 
				+    "Settings.llm = llm\n",
			
 
				+    "Settings.embed_model = embed_model"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "42449b68-47f5-40cf-9207-191307b25e8e",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Download Data\n",
			
 
				+    "\n",
			
 
				+    "Here you'll download data that's used in section 2 and onwards.\n",
			
 
				+    "\n",
			
 
				+    "We'll download some articles on Kendrick, Drake, and their beef (as of May 2024)."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "59b18640-cdfa-42c1-ab53-115983c1fdc4",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!mkdir data\n",
			
 
				+    "!wget \"https://www.dropbox.com/scl/fi/t1soxfjdp0v44an6sdymd/drake_kendrick_beef.pdf?rlkey=u9546ymb7fj8lk2v64r6p5r5k&st=wjzzrgil&dl=1\" -O data/drake_kendrick_beef.pdf\n",
			
 
				+    "!wget \"https://www.dropbox.com/scl/fi/nts3n64s6kymner2jppd6/drake.pdf?rlkey=hksirpqwzlzqoejn55zemk6ld&st=mohyfyh4&dl=1\" -O data/drake.pdf\n",
			
 
				+    "!wget \"https://www.dropbox.com/scl/fi/8ax2vnoebhmy44bes2n1d/kendrick.pdf?rlkey=fhxvn94t5amdqcv9vshifd3hj&st=dxdtytn6&dl=1\" -O data/kendrick.pdf"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "9edee491-05f8-4fbb-9394-baa82f1e5087",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Load Data\n",
			
 
				+    "\n",
			
 
				+    "We load data using LlamaParse by default, but you can also choose to opt for our free pypdf reader (in SimpleDirectoryReader by default) if you don't have an account! \n",
			
 
				+    "\n",
			
 
				+    "1. LlamaParse: Signup for an account here: cloud.llamaindex.ai. You get 1k free pages a day, and paid plan is 7k free pages + 0.3c per additional page. LlamaParse is a good option if you want to parse complex documents, like PDFs with charts, tables, and more. \n",
			
 
				+    "\n",
			
 
				+    "2. Default PDF Parser (In `SimpleDirectoryReader`). If you don't want to signup for an account / use a PDF service, just use the default PyPDF reader bundled in our file loader. It's a good choice for getting started!"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "b648635a-2672-407f-bae6-01660e5426d7",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Uncomment this code if you want to use LlamaParse\n",
			
 
				+    "# from llama_parse import LlamaParse\n",
			
 
				+    "\n",
			
 
				+    "# docs_kendrick = LlamaParse(result_type=\"text\").load_data(\"./data/kendrick.pdf\")\n",
			
 
				+    "# docs_drake = LlamaParse(result_type=\"text\").load_data(\"./data/drake.pdf\")\n",
			
 
				+    "# docs_both = LlamaParse(result_type=\"text\").load_data(\n",
			
 
				+    "#     \"./data/drake_kendrick_beef.pdf\"\n",
			
 
				+    "# )\n",
			
 
				+    "\n",
			
 
				+    "# Uncomment this code if you want to use SimpleDirectoryReader / default PDF Parser\n",
			
 
				+    "# from llama_index.core import SimpleDirectoryReader\n",
			
 
				+    "\n",
			
 
				+    "# docs_kendrick = SimpleDirectoryReader(input_files=[\"data/kendrick.pdf\"]).load_data()\n",
			
 
				+    "# docs_drake = SimpleDirectoryReader(input_files=[\"data/drake.pdf\"]).load_data()\n",
			
 
				+    "# docs_both = SimpleDirectoryReader(input_files=[\"data/drake_kendrick_beef.pdf\"]).load_data()"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "071a8f44-2765-4d57-b8da-15d3c718874d",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## 1. Basic Completion and Chat"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "c0b1ace8-32fb-46b2-a065-8817ddc0310b",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Call complete with a prompt"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "a2db43f9-74af-453c-9f83-8db0379c3302",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "response = llm.complete(\"do you like drake or kendrick better?\")\n",
			
 
				+    "\n",
			
 
				+    "print(response)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "89326153-e2d2-4136-8193-fb27d20670c3",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "stream_response = llm.stream_complete(\n",
			
 
				+    "    \"you're a drake fan. tell me why you like drake more than kendrick\"\n",
			
 
				+    ")\n",
			
 
				+    "\n",
			
 
				+    "for t in stream_response:\n",
			
 
				+    "    print(t.delta, end=\"\")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "a4558339-c8a1-4d26-a430-eb71768b5351",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Call chat with a list of messages"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "5f393031-f743-4a28-a122-71817e3fbd1b",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from llama_index.core.llms import ChatMessage\n",
			
 
				+    "\n",
			
 
				+    "messages = [\n",
			
 
				+    "    ChatMessage(role=\"system\", content=\"You are Kendrick.\"),\n",
			
 
				+    "    ChatMessage(role=\"user\", content=\"Write a verse.\"),\n",
			
 
				+    "]\n",
			
 
				+    "response = llm.chat(messages)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "8e9551fc-0efc-4671-bc57-339121004c39",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "print(response)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "6a67a33d-fe7d-4381-983f-ca3a6945995d",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## 2. Basic RAG (Vector Search, Summarization)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "c104a0c5-e43b-475b-9fa6-186906c1f327",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Basic RAG (Vector Search)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "216787b7-e40a-43fc-a4ca-c43cb798ce9e",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from llama_index.core import VectorStoreIndex\n",
			
 
				+    "\n",
			
 
				+    "index = VectorStoreIndex.from_documents(docs_both)\n",
			
 
				+    "query_engine = index.as_query_engine(similarity_top_k=3)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "a854e9d3-70f1-4927-a2f6-59e90c31f2f0",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "response = query_engine.query(\"Tell me about family matters\")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "da796970-bc38-4cb4-9d32-ebd1b71d4bdc",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "print(str(response))"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "eff935b7-4f37-4758-8997-82fb0852e732",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Basic RAG (Summarization)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "dfe72300-7a38-453e-b1f2-bc1c00a01ff7",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from llama_index.core import SummaryIndex\n",
			
 
				+    "\n",
			
 
				+    "summary_index = SummaryIndex.from_documents(docs_both)\n",
			
 
				+    "summary_engine = summary_index.as_query_engine()"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "178f1f12-51f7-4b45-9346-c16ed12b3b8d",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "response = summary_engine.query(\n",
			
 
				+    "    \"Given your assessment of this article, who won the beef?\"\n",
			
 
				+    ")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "b8125382-d576-4b99-a0da-2fbb71a5b19b",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "print(str(response))"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "68918eb6-f1e6-460c-b1d5-fb49c3fed4b8",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## 3. Advanced RAG (Routing)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "94fd7097-0287-4522-8e43-3e088291fa8a",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Build a Router that can choose whether to do vector search or summarization"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "3949dd41-e9a1-47f6-900f-4f987cad3f84",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from llama_index.core.tools import QueryEngineTool, ToolMetadata\n",
			
 
				+    "\n",
			
 
				+    "vector_tool = QueryEngineTool(\n",
			
 
				+    "    index.as_query_engine(),\n",
			
 
				+    "    metadata=ToolMetadata(\n",
			
 
				+    "        name=\"vector_search\",\n",
			
 
				+    "        description=\"Useful for searching for specific facts.\",\n",
			
 
				+    "    ),\n",
			
 
				+    ")\n",
			
 
				+    "\n",
			
 
				+    "summary_tool = QueryEngineTool(\n",
			
 
				+    "    index.as_query_engine(response_mode=\"tree_summarize\"),\n",
			
 
				+    "    metadata=ToolMetadata(\n",
			
 
				+    "        name=\"summary\",\n",
			
 
				+    "        description=\"Useful for summarizing an entire document.\",\n",
			
 
				+    "    ),\n",
			
 
				+    ")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "d063d07b-c03e-4b26-8556-e3c058d2fd52",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from llama_index.core.query_engine import RouterQueryEngine\n",
			
 
				+    "\n",
			
 
				+    "query_engine = RouterQueryEngine.from_defaults(\n",
			
 
				+    "    [vector_tool, summary_tool], select_multi=False, verbose=True, llm=llm_70b\n",
			
 
				+    ")\n",
			
 
				+    "\n",
			
 
				+    "response = query_engine.query(\n",
			
 
				+    "    \"Tell me about the song meet the grahams - why is it significant\"\n",
			
 
				+    ")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "396aad75-5a71-4bd9-a760-7f13fe223079",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "print(response)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "a795f0bc-e871-4580-8983-6fb27d421fc5",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## 4. Text-to-SQL \n",
			
 
				+    "\n",
			
 
				+    "Here, we download and use a sample SQLite database with 11 tables, with various info about music, playlists, and customers. We will limit to a select few tables for this test."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "a5096501-92c3-41af-a871-ade869d710fb",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "!wget \"https://www.sqlitetutorial.net/wp-content/uploads/2018/03/chinook.zip\" -O \"./data/chinook.zip\"\n",
			
 
				+    "!unzip \"./data/chinook.zip\""
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "d4db989e-c18d-4416-928e-7be4ead4d869",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from sqlalchemy import (\n",
			
 
				+    "    create_engine,\n",
			
 
				+    "    MetaData,\n",
			
 
				+    "    Table,\n",
			
 
				+    "    Column,\n",
			
 
				+    "    String,\n",
			
 
				+    "    Integer,\n",
			
 
				+    "    select,\n",
			
 
				+    "    column,\n",
			
 
				+    ")\n",
			
 
				+    "\n",
			
 
				+    "engine = create_engine(\"sqlite:///chinook.db\")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "bf6ed233-0ea3-4d4f-8c33-5b6d558b89b9",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from llama_index.core import SQLDatabase\n",
			
 
				+    "\n",
			
 
				+    "sql_database = SQLDatabase(engine)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "debae423-1004-40f6-9356-e1c3add4d965",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from llama_index.core.indices.struct_store import NLSQLTableQueryEngine\n",
			
 
				+    "\n",
			
 
				+    "query_engine = NLSQLTableQueryEngine(\n",
			
 
				+    "    sql_database=sql_database,\n",
			
 
				+    "    tables=[\"albums\", \"tracks\", \"artists\"],\n",
			
 
				+    "    llm=llm_70b,\n",
			
 
				+    ")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "a65ecd70-09c4-4872-b712-3a8235d03db2",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "response = query_engine.query(\"What are some albums?\")\n",
			
 
				+    "\n",
			
 
				+    "print(response)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "c12b93ef-d6d1-4d15-9cb2-343070f72851",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "response = query_engine.query(\"What are some artists? Limit it to 5.\")\n",
			
 
				+    "\n",
			
 
				+    "print(response)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "2c243d38-c6ac-445c-b9d4-53a9ae013b7b",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "This last query should be a more complex join"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "553741c2-1050-445d-979a-ae2150ee3248",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "response = query_engine.query(\n",
			
 
				+    "    \"What are some tracks from the artist AC/DC? Limit it to 3\"\n",
			
 
				+    ")\n",
			
 
				+    "\n",
			
 
				+    "print(response)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "300689d7-9e67-4404-9898-27404ee6d4b5",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "print(response.metadata[\"sql_query\"])"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "1419fe67-aa6a-47db-88cd-9bb251c15615",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## 5. Structured Data Extraction\n",
			
 
				+    "\n",
			
 
				+    "An important use case for function calling is extracting structured objects. LlamaIndex provides an intuitive interface for this through `structured_predict` - simply define the target Pydantic class (can be nested), and given a prompt, we extract out the desired object.\n",
			
 
				+    "\n",
			
 
				+    "**NOTE**: Since there's no native function calling support with Llama3, the structured extraction is performed by prompting the LLM + output parsing."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "4432f35a-5f29-45e9-a928-32e6d77b158e",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from llama_index.llms.groq import Groq\n",
			
 
				+    "from llama_index.core.prompts import PromptTemplate\n",
			
 
				+    "from pydantic import BaseModel\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "class Restaurant(BaseModel):\n",
			
 
				+    "    \"\"\"A restaurant with name, city, and cuisine.\"\"\"\n",
			
 
				+    "\n",
			
 
				+    "    name: str\n",
			
 
				+    "    city: str\n",
			
 
				+    "    cuisine: str\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "llm = Groq(model=\"llama3-8b-8192\", pydantic_program_mode=\"llm\")\n",
			
 
				+    "prompt_tmpl = PromptTemplate(\n",
			
 
				+    "    \"Generate a restaurant in a given city {city_name}\"\n",
			
 
				+    ")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "2c451f52-a051-4ba2-a683-0c1fd258d986",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "restaurant_obj = llm.structured_predict(\n",
			
 
				+    "    Restaurant, prompt_tmpl, city_name=\"Miami\"\n",
			
 
				+    ")\n",
			
 
				+    "print(restaurant_obj)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "839018a9-b65f-4824-83f7-2e4e52b55c5d",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## 6. Adding Chat History to RAG (Chat Engine)\n",
			
 
				+    "\n",
			
 
				+    "In this section we create a stateful chatbot from a RAG pipeline, with our chat engine abstraction.\n",
			
 
				+    "\n",
			
 
				+    "Unlike a stateless query engine, the chat engine maintains conversation history (through a memory module like buffer memory). It performs retrieval given a condensed question, and feeds the condensed question + context + chat history into the final LLM prompt.\n",
			
 
				+    "\n",
			
 
				+    "Related resource: https://docs.llamaindex.ai/en/stable/examples/chat_engine/chat_engine_condense_plus_context/"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "27e56315-9513-4b32-bf9a-ce97c3ab52df",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from llama_index.core.memory import ChatMemoryBuffer\n",
			
 
				+    "from llama_index.core.chat_engine import CondensePlusContextChatEngine\n",
			
 
				+    "\n",
			
 
				+    "memory = ChatMemoryBuffer.from_defaults(token_limit=3900)\n",
			
 
				+    "\n",
			
 
				+    "chat_engine = CondensePlusContextChatEngine.from_defaults(\n",
			
 
				+    "    index.as_retriever(),\n",
			
 
				+    "    memory=memory,\n",
			
 
				+    "    llm=llm,\n",
			
 
				+    "    context_prompt=(\n",
			
 
				+    "        \"You are a chatbot, able to have normal interactions, as well as talk\"\n",
			
 
				+    "        \" about the Kendrick and Drake beef.\"\n",
			
 
				+    "        \"Here are the relevant documents for the context:\\n\"\n",
			
 
				+    "        \"{context_str}\"\n",
			
 
				+    "        \"\\nInstruction: Use the previous chat history, or the context above, to interact and help the user.\"\n",
			
 
				+    "    ),\n",
			
 
				+    "    verbose=True,\n",
			
 
				+    ")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "b24524d2-fdce-4237-8ecc-67f139302303",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "response = chat_engine.chat(\n",
			
 
				+    "    \"Tell me about the songs Drake released in the beef.\"\n",
			
 
				+    ")\n",
			
 
				+    "print(str(response))"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "f9a87a16-2864-4c48-95e7-a2103e119242",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "response = chat_engine.chat(\"What about Kendrick?\")\n",
			
 
				+    "print(str(response))"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "a7fa07ed-58f0-445e-bbd3-4ad8bac6598e",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## 7. Agents\n",
			
 
				+    "\n",
			
 
				+    "Here we build agents with Llama 3. We perform RAG over simple functions as well as the documents above."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "aa98d735-5d43-413f-aab3-fc3adeed81b1",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Agents And Tools"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "fb73a01f-8a2e-4dd6-91f8-710c92b81c56",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "import json\n",
			
 
				+    "from typing import Sequence, List\n",
			
 
				+    "\n",
			
 
				+    "from llama_index.core.llms import ChatMessage\n",
			
 
				+    "from llama_index.core.tools import BaseTool, FunctionTool\n",
			
 
				+    "from llama_index.core.agent import ReActAgent\n",
			
 
				+    "\n",
			
 
				+    "import nest_asyncio\n",
			
 
				+    "\n",
			
 
				+    "nest_asyncio.apply()"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "efbee832-9786-4551-93f2-01ee90fa0f4d",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Define Tools"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "b2058b36-8053-4dc8-9218-c286702ecf66",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "def multiply(a: int, b: int) -> int:\n",
			
 
				+    "    \"\"\"Multiple two integers and returns the result integer\"\"\"\n",
			
 
				+    "    return a * b\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "def add(a: int, b: int) -> int:\n",
			
 
				+    "    \"\"\"Add two integers and returns the result integer\"\"\"\n",
			
 
				+    "    return a + b\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "def subtract(a: int, b: int) -> int:\n",
			
 
				+    "    \"\"\"Subtract two integers and returns the result integer\"\"\"\n",
			
 
				+    "    return a - b\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "def divide(a: int, b: int) -> int:\n",
			
 
				+    "    \"\"\"Divides two integers and returns the result integer\"\"\"\n",
			
 
				+    "    return a / b\n",
			
 
				+    "\n",
			
 
				+    "\n",
			
 
				+    "multiply_tool = FunctionTool.from_defaults(fn=multiply)\n",
			
 
				+    "add_tool = FunctionTool.from_defaults(fn=add)\n",
			
 
				+    "subtract_tool = FunctionTool.from_defaults(fn=subtract)\n",
			
 
				+    "divide_tool = FunctionTool.from_defaults(fn=divide)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "22d7d4dc-e2ce-402c-9350-0e7010d0080c",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### ReAct Agent"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "72a48053-e30d-4884-bcac-80752047d940",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "agent = ReActAgent.from_tools(\n",
			
 
				+    "    [multiply_tool, add_tool, subtract_tool, divide_tool],\n",
			
 
				+    "    llm=llm_70b,\n",
			
 
				+    "    verbose=True,\n",
			
 
				+    ")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "7ada828a-3b05-4fc1-90e8-986c5607ae61",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Querying"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "9c0b1e56-d9f7-4615-a15a-c91fea1adb00",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "response = agent.chat(\"What is (121 + 2) * 5?\")\n",
			
 
				+    "print(str(response))"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "67ce45f6-bdd4-42aa-8f74-43a50f14094e",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### ReAct Agent With RAG QueryEngine Tools"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "97fce5f1-eacf-4ecc-9e83-072e74d3a2a9",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "from llama_index.core import (\n",
			
 
				+    "    SimpleDirectoryReader,\n",
			
 
				+    "    VectorStoreIndex,\n",
			
 
				+    "    StorageContext,\n",
			
 
				+    "    load_index_from_storage,\n",
			
 
				+    ")\n",
			
 
				+    "\n",
			
 
				+    "from llama_index.core.tools import QueryEngineTool, ToolMetadata"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "23963d00-e3d2-4ce1-9ac3-aa486bf4b1a5",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Create ReAct Agent using RAG QueryEngine Tools"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "1844dbbd-477c-4c4d-bb18-2c2e16a75a50",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "This may take 4 minutes to run:"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "66ab1e60-3374-4eb9-b7dc-c28db3b47c51",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "drake_index = VectorStoreIndex.from_documents(docs_drake)\n",
			
 
				+    "drake_query_engine = drake_index.as_query_engine(similarity_top_k=3)\n",
			
 
				+    "\n",
			
 
				+    "kendrick_index = VectorStoreIndex.from_documents(docs_kendrick)\n",
			
 
				+    "kendrick_query_engine = kendrick_index.as_query_engine(similarity_top_k=3)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "0e241fe9-f390-4be5-b3c4-da4f56db01ef",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "drake_tool = QueryEngineTool(\n",
			
 
				+    "    drake_index.as_query_engine(),\n",
			
 
				+    "    metadata=ToolMetadata(\n",
			
 
				+    "        name=\"drake_search\",\n",
			
 
				+    "        description=\"Useful for searching over Drake's life.\",\n",
			
 
				+    "    ),\n",
			
 
				+    ")\n",
			
 
				+    "\n",
			
 
				+    "kendrick_tool = QueryEngineTool(\n",
			
 
				+    "    kendrick_index.as_query_engine(),\n",
			
 
				+    "    metadata=ToolMetadata(\n",
			
 
				+    "        name=\"kendrick_search\",\n",
			
 
				+    "        description=\"Useful for searching over Kendrick's life.\",\n",
			
 
				+    "    ),\n",
			
 
				+    ")\n",
			
 
				+    "\n",
			
 
				+    "query_engine_tools = [drake_tool, kendrick_tool]"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "b922feac-b221-4737-92c6-e63eeab4eab7",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "agent = ReActAgent.from_tools(\n",
			
 
				+    "    query_engine_tools,\n",
			
 
				+    "    llm=llm_70b,\n",
			
 
				+    "    verbose=True,\n",
			
 
				+    ")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "7e38edc8-47f8-4f1a-ad87-bc3a9e31a65e",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "### Querying"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "035c2c8b-5a5e-4df0-a423-4c2d6054f457",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "response = agent.chat(\"Tell me about how Kendrick and Drake grew up\")\n",
			
 
				+    "print(str(response))"
			
 
				+   ]
			
 
				+  }
			
 
				+ ],
			
 
				+ "metadata": {
			
 
				+  "kernelspec": {
			
 
				+   "display_name": "Python 3 (ipykernel)",
			
 
				+   "language": "python",
			
 
				+   "name": "python3"
			
 
				+  },
			
 
				+  "language_info": {
			
 
				+   "codemirror_mode": {
			
 
				+    "name": "ipython",
			
 
				+    "version": 3
			
 
				+   },
			
 
				+   "file_extension": ".py",
			
 
				+   "mimetype": "text/x-python",
			
 
				+   "name": "python",
			
 
				+   "nbconvert_exporter": "python",
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.10.14"
			
 
				+  }
			
 
				+ },
			
 
				+ "nbformat": 4,
			
 
				+ "nbformat_minor": 5
			
 
				+}
			
--- a/recipes/quickstart/Getting_to_know_Llama.ipynb
+++ b/recipes/quickstart/Getting_to_know_Llama.ipynb
--- a/recipes/quickstart/Prompt_Engineering_with_Llama_2.ipynb
+++ b/recipes/quickstart/Prompt_Engineering_with_Llama_2.ipynb
@@ -5,11 +5,13 @@
 
				    "cell_type": "markdown",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "# Prompt Engineering with Llama 2\n",
			
 
				+    "<a href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/quickstart/Prompt_Engineering_with_Llama_3.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
			
 
				+    "\n",
			
 
				+    "# Prompt Engineering with Llama 3\n",
			
 
				     "\n",
			
 
				     "Prompt engineering is using natural language to produce a desired response from a large language model (LLM).\n",
			
 
				     "\n",
			
 
				-    "This interactive guide covers prompt engineering & best practices with Llama 2."
			
 
				+    "This interactive guide covers prompt engineering & best practices with Llama 3."
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -41,7 +43,13 @@
 
				     "\n",
			
 
				     "In 2023, Meta introduced the [Llama language models](https://ai.meta.com/llama/) (Llama Chat, Code Llama, Llama Guard). These are general purpose, state-of-the-art LLMs.\n",
			
 
				     "\n",
			
 
				-    "Llama 2 models come in 7 billion, 13 billion, and 70 billion parameter sizes. Smaller models are cheaper to deploy and run (see: deployment and performance); larger models are more capable.\n",
			
 
				+    "Llama models come in varying parameter sizes. The smaller models are cheaper to deploy and run; the larger models are more capable.\n",
			
 
				+    "\n",
			
 
				+    "#### Llama 3\n",
			
 
				+    "1. `llama-3-8b` - base pretrained 8 billion parameter model\n",
			
 
				+    "1. `llama-3-70b` - base pretrained 70 billion parameter model\n",
			
 
				+    "1. `llama-3-8b-instruct` - instruction fine-tuned 8 billion parameter model\n",
			
 
				+    "1. `llama-3-70b-instruct` - instruction fine-tuned 70 billion parameter model (flagship)\n",
			
 
				     "\n",
			
 
				     "#### Llama 2\n",
			
 
				     "1. `llama-2-7b` - base pretrained 7 billion parameter model\n",
			
@@ -69,12 +77,15 @@
 
				     "1. `codellama-7b` - code fine-tuned 7 billion parameter model\n",
			
 
				     "1. `codellama-13b` - code fine-tuned 13 billion parameter model\n",
			
 
				     "1. `codellama-34b` - code fine-tuned 34 billion parameter model\n",
			
 
				+    "1. `codellama-70b` - code fine-tuned 70 billion parameter model\n",
			
 
				     "1. `codellama-7b-instruct` - code & instruct fine-tuned 7 billion parameter model\n",
			
 
				     "2. `codellama-13b-instruct` - code & instruct fine-tuned 13 billion parameter model\n",
			
 
				     "3. `codellama-34b-instruct` - code & instruct fine-tuned 34 billion parameter model\n",
			
 
				+    "3. `codellama-70b-instruct` - code & instruct fine-tuned 70 billion parameter model\n",
			
 
				     "1. `codellama-7b-python` - Python fine-tuned 7 billion parameter model\n",
			
 
				     "2. `codellama-13b-python` - Python fine-tuned 13 billion parameter model\n",
			
 
				-    "3. `codellama-34b-python` - Python fine-tuned 34 billion parameter model"
			
 
				+    "3. `codellama-34b-python` - Python fine-tuned 34 billion parameter model\n",
			
 
				+    "3. `codellama-70b-python` - Python fine-tuned 70 billion parameter model"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -86,11 +97,11 @@
 
				     "\n",
			
 
				     "Large language models are deployed and accessed in a variety of ways, including:\n",
			
 
				     "\n",
			
 
				-    "1. **Self-hosting**: Using local hardware to run inference. Ex. running Llama 2 on your Macbook Pro using [llama.cpp](https://github.com/ggerganov/llama.cpp).\n",
			
 
				+    "1. **Self-hosting**: Using local hardware to run inference. Ex. running Llama on your Macbook Pro using [llama.cpp](https://github.com/ggerganov/llama.cpp).\n",
			
 
				     "    * Best for privacy/security or if you already have a GPU.\n",
			
 
				-    "1. **Cloud hosting**: Using a cloud provider to deploy an instance that hosts a specific model. Ex. running Llama 2 on cloud providers like AWS, Azure, GCP, and others.\n",
			
 
				+    "1. **Cloud hosting**: Using a cloud provider to deploy an instance that hosts a specific model. Ex. running Llama on cloud providers like AWS, Azure, GCP, and others.\n",
			
 
				     "    * Best for customizing models and their runtime (ex. fine-tuning a model for your use case).\n",
			
 
				-    "1. **Hosted API**: Call LLMs directly via an API. There are many companies that provide Llama 2 inference APIs including AWS Bedrock, Replicate, Anyscale, Together and others.\n",
			
 
				+    "1. **Hosted API**: Call LLMs directly via an API. There are many companies that provide Llama inference APIs including AWS Bedrock, Replicate, Anyscale, Together and others.\n",
			
 
				     "    * Easiest option overall."
			
 
				    ]
			
 
				   },
			
@@ -118,11 +129,11 @@
 
				     "\n",
			
 
				     "> Our destiny is written in the stars.\n",
			
 
				     "\n",
			
 
				-    "...is tokenized into `[\"our\", \"dest\", \"iny\", \"is\", \"written\", \"in\", \"the\", \"stars\"]` for Llama 2.\n",
			
 
				+    "...is tokenized into `[\"Our\", \" destiny\", \" is\", \" written\", \" in\", \" the\", \" stars\", \".\"]` for Llama 3. See [this](https://tiktokenizer.vercel.app/?model=meta-llama%2FMeta-Llama-3-8B) for an interactive tokenizer tool.\n",
			
 
				     "\n",
			
 
				     "Tokens matter most when you consider API pricing and internal behavior (ex. hyperparameters).\n",
			
 
				     "\n",
			
 
				-    "Each model has a maximum context length that your prompt cannot exceed. That's 4096 tokens for Llama 2 and 100K for Code Llama. \n"
			
 
				+    "Each model has a maximum context length that your prompt cannot exceed. That's 8K tokens for Llama 3, 4K for Llama 2, and 100K for Code Llama. \n"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -132,7 +143,7 @@
 
				    "source": [
			
 
				     "## Notebook Setup\n",
			
 
				     "\n",
			
 
				-    "The following APIs will be used to call LLMs throughout the guide. As an example, we'll call Llama 2 chat using [Replicate](https://replicate.com/meta/llama-2-70b-chat) and use LangChain to easily set up a chat completion API.\n",
			
 
				+    "The following APIs will be used to call LLMs throughout the guide. As an example, we'll call Llama 3 chat using [Grok](https://console.groq.com/playground?model=llama3-70b-8192).\n",
			
 
				     "\n",
			
 
				     "To install prerequisites run:"
			
 
				    ]
			
@@ -143,7 +154,8 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "pip install langchain replicate"
			
 
				+    "import sys\n",
			
 
				+    "!{sys.executable} -m pip install groq"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -152,64 +164,54 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "from typing import Dict, List\n",
			
 
				-    "from langchain.llms import Replicate\n",
			
 
				-    "from langchain.memory import ChatMessageHistory\n",
			
 
				-    "from langchain.schema.messages import get_buffer_string\n",
			
 
				     "import os\n",
			
 
				+    "from typing import Dict, List\n",
			
 
				+    "from groq import Groq\n",
			
 
				     "\n",
			
 
				-    "# Get a free API key from https://replicate.com/account/api-tokens\n",
			
 
				-    "os.environ[\"REPLICATE_API_TOKEN\"] = \"YOUR_KEY_HERE\"\n",
			
 
				+    "# Get a free API key from https://console.groq.com/keys\n",
			
 
				+    "os.environ[\"GROQ_API_KEY\"] = \"YOUR_GROQ_API_KEY\"\n",
			
 
				     "\n",
			
 
				-    "LLAMA2_70B_CHAT = \"meta/llama-2-70b-chat:2d19859030ff705a87c746f7e96eea03aefb71f166725aee39692f1476566d48\"\n",
			
 
				-    "LLAMA2_13B_CHAT = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
			
 
				+    "LLAMA3_70B_INSTRUCT = \"llama3-70b-8192\"\n",
			
 
				+    "LLAMA3_8B_INSTRUCT = \"llama3-8b-8192\"\n",
			
 
				     "\n",
			
 
				-    "# We'll default to the smaller 13B model for speed; change to LLAMA2_70B_CHAT for more advanced (but slower) generations\n",
			
 
				-    "DEFAULT_MODEL = LLAMA2_13B_CHAT\n",
			
 
				+    "DEFAULT_MODEL = LLAMA3_70B_INSTRUCT\n",
			
 
				     "\n",
			
 
				-    "def completion(\n",
			
 
				-    "    prompt: str,\n",
			
 
				-    "    model: str = DEFAULT_MODEL,\n",
			
 
				+    "client = Groq()\n",
			
 
				+    "\n",
			
 
				+    "def assistant(content: str):\n",
			
 
				+    "    return { \"role\": \"assistant\", \"content\": content }\n",
			
 
				+    "\n",
			
 
				+    "def user(content: str):\n",
			
 
				+    "    return { \"role\": \"user\", \"content\": content }\n",
			
 
				+    "\n",
			
 
				+    "def chat_completion(\n",
			
 
				+    "    messages: List[Dict],\n",
			
 
				+    "    model = DEFAULT_MODEL,\n",
			
 
				     "    temperature: float = 0.6,\n",
			
 
				     "    top_p: float = 0.9,\n",
			
 
				     ") -> str:\n",
			
 
				-    "    llm = Replicate(\n",
			
 
				+    "    response = client.chat.completions.create(\n",
			
 
				+    "        messages=messages,\n",
			
 
				     "        model=model,\n",
			
 
				-    "        model_kwargs={\"temperature\": temperature,\"top_p\": top_p, \"max_new_tokens\": 1000}\n",
			
 
				+    "        temperature=temperature,\n",
			
 
				+    "        top_p=top_p,\n",
			
 
				     "    )\n",
			
 
				-    "    return llm(prompt)\n",
			
 
				+    "    return response.choices[0].message.content\n",
			
 
				+    "        \n",
			
 
				     "\n",
			
 
				-    "def chat_completion(\n",
			
 
				-    "    messages: List[Dict],\n",
			
 
				-    "    model = DEFAULT_MODEL,\n",
			
 
				+    "def completion(\n",
			
 
				+    "    prompt: str,\n",
			
 
				+    "    model: str = DEFAULT_MODEL,\n",
			
 
				     "    temperature: float = 0.6,\n",
			
 
				     "    top_p: float = 0.9,\n",
			
 
				     ") -> str:\n",
			
 
				-    "    history = ChatMessageHistory()\n",
			
 
				-    "    for message in messages:\n",
			
 
				-    "        if message[\"role\"] == \"user\":\n",
			
 
				-    "            history.add_user_message(message[\"content\"])\n",
			
 
				-    "        elif message[\"role\"] == \"assistant\":\n",
			
 
				-    "            history.add_ai_message(message[\"content\"])\n",
			
 
				-    "        else:\n",
			
 
				-    "            raise Exception(\"Unknown role\")\n",
			
 
				-    "    return completion(\n",
			
 
				-    "        get_buffer_string(\n",
			
 
				-    "            history.messages,\n",
			
 
				-    "            human_prefix=\"USER\",\n",
			
 
				-    "            ai_prefix=\"ASSISTANT\",\n",
			
 
				-    "        ),\n",
			
 
				-    "        model,\n",
			
 
				-    "        temperature,\n",
			
 
				-    "        top_p,\n",
			
 
				+    "    return chat_completion(\n",
			
 
				+    "        [user(prompt)],\n",
			
 
				+    "        model=model,\n",
			
 
				+    "        temperature=temperature,\n",
			
 
				+    "        top_p=top_p,\n",
			
 
				     "    )\n",
			
 
				     "\n",
			
 
				-    "def assistant(content: str):\n",
			
 
				-    "    return { \"role\": \"assistant\", \"content\": content }\n",
			
 
				-    "\n",
			
 
				-    "def user(content: str):\n",
			
 
				-    "    return { \"role\": \"user\", \"content\": content }\n",
			
 
				-    "\n",
			
 
				     "def complete_and_print(prompt: str, model: str = DEFAULT_MODEL):\n",
			
 
				     "    print(f'==============\\n{prompt}\\n==============')\n",
			
 
				     "    response = completion(prompt, model)\n",
			
@@ -223,7 +225,7 @@
 
				    "source": [
			
 
				     "### Completion APIs\n",
			
 
				     "\n",
			
 
				-    "Llama 2 models tend to be wordy and explain their rationale. Later we'll explore how to manage the response length."
			
 
				+    "Let's try Llama 3!"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -345,7 +347,7 @@
 
				    "cell_type": "markdown",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				-    "You can think about giving explicit instructions as using rules and restrictions to how Llama 2 responds to your prompt.\n",
			
 
				+    "You can think about giving explicit instructions as using rules and restrictions to how Llama 3 responds to your prompt.\n",
			
 
				     "\n",
			
 
				     "- Stylization\n",
			
 
				     "    - `Explain this to me like a topic on a children's educational network show teaching elementary students.`\n",
			
@@ -387,9 +389,9 @@
 
				     "\n",
			
 
				     "#### Zero-Shot Prompting\n",
			
 
				     "\n",
			
 
				-    "Large language models like Llama 2 are unique because they are capable of following instructions and producing responses without having previously seen an example of a task. Prompting without examples is called \"zero-shot prompting\".\n",
			
 
				+    "Large language models like Llama 3 are unique because they are capable of following instructions and producing responses without having previously seen an example of a task. Prompting without examples is called \"zero-shot prompting\".\n",
			
 
				     "\n",
			
 
				-    "Let's try using Llama 2 as a sentiment detector. You may notice that output format varies - we can improve this with better prompting."
			
 
				+    "Let's try using Llama 3 as a sentiment detector. You may notice that output format varies - we can improve this with better prompting."
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -459,9 +461,9 @@
 
				    "source": [
			
 
				     "### Role Prompting\n",
			
 
				     "\n",
			
 
				-    "Llama 2 will often give more consistent responses when given a role ([Kong et al. (2023)](https://browse.arxiv.org/pdf/2308.07702.pdf)). Roles give context to the LLM on what type of answers are desired.\n",
			
 
				+    "Llama will often give more consistent responses when given a role ([Kong et al. (2023)](https://browse.arxiv.org/pdf/2308.07702.pdf)). Roles give context to the LLM on what type of answers are desired.\n",
			
 
				     "\n",
			
 
				-    "Let's use Llama 2 to create a more focused, technical response for a question around the pros and cons of using PyTorch."
			
 
				+    "Let's use Llama 3 to create a more focused, technical response for a question around the pros and cons of using PyTorch."
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -484,7 +486,9 @@
 
				    "source": [
			
 
				     "### Chain-of-Thought\n",
			
 
				     "\n",
			
 
				-    "Simply adding a phrase encouraging step-by-step thinking \"significantly improves the ability of large language models to perform complex reasoning\" ([Wei et al. (2022)](https://arxiv.org/abs/2201.11903)). This technique is called \"CoT\" or \"Chain-of-Thought\" prompting:"
			
 
				+    "Simply adding a phrase encouraging step-by-step thinking \"significantly improves the ability of large language models to perform complex reasoning\" ([Wei et al. (2022)](https://arxiv.org/abs/2201.11903)). This technique is called \"CoT\" or \"Chain-of-Thought\" prompting.\n",
			
 
				+    "\n",
			
 
				+    "Llama 3 now reasons step-by-step naturally without the addition of the phrase. This section remains for completeness."
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -493,10 +497,12 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "complete_and_print(\"Who lived longer Elvis Presley or Mozart?\")\n",
			
 
				-    "# Often gives incorrect answer of \"Mozart\"\n",
			
 
				+    "prompt = \"Who lived longer, Mozart or Elvis?\"\n",
			
 
				+    "\n",
			
 
				+    "complete_and_print(prompt)\n",
			
 
				+    "# Llama 2 would often give the incorrect answer of \"Mozart\"\n",
			
 
				     "\n",
			
 
				-    "complete_and_print(\"Who lived longer Elvis Presley or Mozart? Let's think through this carefully, step by step.\")\n",
			
 
				+    "complete_and_print(f\"{prompt} Let's think through this carefully, step by step.\")\n",
			
 
				     "# Gives the correct answer \"Elvis\""
			
 
				    ]
			
 
				   },
			
@@ -523,10 +529,9 @@
 
				     "    response = completion(\n",
			
 
				     "        \"John found that the average of 15 numbers is 40.\"\n",
			
 
				     "        \"If 10 is added to each number then the mean of the numbers is?\"\n",
			
 
				-    "        \"Report the answer surrounded by three backticks, for example: ```123```\",\n",
			
 
				-    "        model = LLAMA2_70B_CHAT\n",
			
 
				+    "        \"Report the answer surrounded by backticks (example: `123`)\",\n",
			
 
				     "    )\n",
			
 
				-    "    match = re.search(r'```(\\d+)```', response)\n",
			
 
				+    "    match = re.search(r'`(\\d+)`', response)\n",
			
 
				     "    if match is None:\n",
			
 
				     "        return None\n",
			
 
				     "    return match.group(1)\n",
			
@@ -538,10 +543,10 @@
 
				     "    f\"Final answer: {mode(answers)}\",\n",
			
 
				     "    )\n",
			
 
				     "\n",
			
 
				-    "# Sample runs of Llama-2-70B (all correct):\n",
			
 
				-    "# [50, 50, 750, 50, 50]  -> 50\n",
			
 
				-    "# [130, 10, 750, 50, 50] -> 50\n",
			
 
				-    "# [50, None, 10, 50, 50] -> 50"
			
 
				+    "# Sample runs of Llama-3-70B (all correct):\n",
			
 
				+    "# ['60', '50', '50', '50', '50'] -> 50\n",
			
 
				+    "# ['50', '50', '50', '60', '50'] -> 50\n",
			
 
				+    "# ['50', '50', '60', '50', '50'] -> 50"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -560,7 +565,7 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "complete_and_print(\"What is the capital of the California?\", model = LLAMA2_70B_CHAT)\n",
			
 
				+    "complete_and_print(\"What is the capital of the California?\")\n",
			
 
				     "# Gives the correct answer \"Sacramento\""
			
 
				    ]
			
 
				   },
			
@@ -677,7 +682,6 @@
 
				     "    \"\"\"\n",
			
 
				     "    # Python code to calculate: ((-5 + 93 * 4 - 0) * (4^4 + -7 + 0 * 5))\n",
			
 
				     "    \"\"\",\n",
			
 
				-    "    model=\"meta/codellama-34b:67942fd0f55b66da802218a19a8f0e1d73095473674061a6ea19f2dc8c053152\"\n",
			
 
				     ")"
			
 
				    ]
			
 
				   },
			
@@ -687,12 +691,10 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "# The following code was generated by Code Llama 34B:\n",
			
 
				+    "# The following code was generated by Llama 3 70B:\n",
			
 
				     "\n",
			
 
				-    "num1 = (-5 + 93 * 4 - 0)\n",
			
 
				-    "num2 = (4**4 + -7 + 0 * 5)\n",
			
 
				-    "answer = num1 * num2\n",
			
 
				-    "print(answer)"
			
 
				+    "result = ((-5 + 93 * 4 - 0) * (4**4 - 7 + 0 * 5))\n",
			
 
				+    "print(result)"
			
 
				    ]
			
 
				   },
			
 
				   {
			
@@ -702,7 +704,7 @@
 
				    "source": [
			
 
				     "### Limiting Extraneous Tokens\n",
			
 
				     "\n",
			
 
				-    "A common struggle is getting output without extraneous tokens (ex. \"Sure! Here's more information on...\").\n",
			
 
				+    "A common struggle with Llama 2 is getting output without extraneous tokens (ex. \"Sure! Here's more information on...\"), even if explicit instructions are given to Llama 2 to be concise and no preamble. Llama 3 can better follow instructions.\n",
			
 
				     "\n",
			
 
				     "Check out this improvement that combines a role, rules and restrictions, explicit instructions, and an example:"
			
 
				    ]
			
@@ -715,7 +717,6 @@
 
				    "source": [
			
 
				     "complete_and_print(\n",
			
 
				     "    \"Give me the zip code for Menlo Park in JSON format with the field 'zip_code'\",\n",
			
 
				-    "    model = LLAMA2_70B_CHAT,\n",
			
 
				     ")\n",
			
 
				     "# Likely returns the JSON and also \"Sure! Here's the JSON...\"\n",
			
 
				     "\n",
			
@@ -726,7 +727,6 @@
 
				     "    Example question: What is the zip code of the Empire State Building? Example answer: {'zip_code': 10118}\n",
			
 
				     "    Now here is my question: What is the zip code of Menlo Park?\n",
			
 
				     "    \"\"\",\n",
			
 
				-    "    model = LLAMA2_70B_CHAT,\n",
			
 
				     ")\n",
			
 
				     "# \"{'zip_code': 94025}\""
			
 
				    ]
			
@@ -770,7 +770,8 @@
 
				    "mimetype": "text/x-python",
			
 
				    "name": "python",
			
 
				    "nbconvert_exporter": "python",
			
 
				-   "pygments_lexer": "ipython3"
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.10.14"
			
 
				   },
			
 
				   "last_base_url": "https://bento.edge.x2p.facebook.net/",
			
 
				   "last_kernel_id": "161e2a7b-2d2b-4995-87f3-d1539860ecac",
			
--- a/recipes/use_cases/LiveData.ipynb
+++ b/recipes/use_cases/LiveData.ipynb
--- a/recipes/use_cases/MediaGen.ipynb
+++ b/recipes/use_cases/MediaGen.ipynb
--- a/recipes/use_cases/README.md
+++ b/recipes/use_cases/README.md
@@ -14,4 +14,10 @@ This step-by-step tutorial shows how to use the [WhatsApp Business API](https://
 
				 This step-by-step tutorial shows how to use the [Messenger Platform](https://developers.facebook.com/docs/messenger-platform/overview) to build a Llama 3 enabled Messenger chatbot.
			
 
				 
			
 
				 ### RAG Chatbot Example (running [locally](./chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) or on [OctoAI](../llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/RAG_Chatbot_Example.ipynb))
			
 
				-A complete example of how to build a Llama 3 chatbot hosted on your browser that can answer questions based on your own data using retrieval augmented generation (RAG). You can run Llama2 locally if you have a good enough GPU or on OctoAI if you follow the note [here](../README.md#octoai_note).
			
 
				+A complete example of how to build a Llama 3 chatbot hosted on your browser that can answer questions based on your own data using retrieval augmented generation (RAG). You can run Llama2 locally if you have a good enough GPU or on OctoAI if you follow the note [here](../README.md#octoai_note).
			
 
				+
			
 
				+## [Sales Bot](./chatbots/sales_bot/SalesBot.ipynb): Sales Bot with Llama3 - A Summarization and RAG Use Case
			
 
				+An summarization + RAG use case built around the Amazon product review Kaggle dataset to build a helpful Music Store Sales Bot. The summarization and RAG are built on top of Llama models hosted on OctoAI, and the vector database is hosted on Weaviate Cloud Services.
			
 
				+
			
 
				+## [Media Generation](./MediaGen.ipynb): Building a Video Generation Pipeline with Llama3
			
 
				+This step-by-step tutorial shows how to use leverage Llama 3 to drive the generation of animated videos using SDXL and SVD. More specifically it relies on JSON formatting to produce a scene-by-scene story board of a recipe video. The user provides the name of a dish, then Llama 3 describes a step by step guide to reproduce the said dish. This step by step guide is brought to life with models like SDXL and SVD.
			
--- a/recipes/use_cases/agents/langchain/README.md
+++ b/recipes/use_cases/agents/langchain/README.md
@@ -0,0 +1,70 @@
 
				+# LangChain <> Llama3 Cookbooks
			
 
				+
			
 
				+LLM agents use [planning, memory, and tools](https://lilianweng.github.io/posts/2023-06-23-agent/) to accomplish tasks. Agents can empower Llama 3 with important new capabilities. Here, we will show how to give Llama 3 the ability to perform web search, as well as multi-modality: image generation (text-to-image), image analysis (image-to-text), and voice (text-to-speech) tools!
			
 
				+
			
 
				+LangChain offers several different ways to implement agents with Llama 3:
			
 
				+
			
 
				+(1) `ReAct agent` - Uses [AgentExecutor](https://python.langchain.com/docs/modules/agents/quick_start/) with [tool-calling](https://python.langchain.com/docs/integrations/chat/) versions of Llama 3.
			
 
				+
			
 
				+(2) `LangGraph tool calling agent` - Uses [LangGraph](https://python.langchain.com/docs/langgraph) with [tool-calling](https://python.langchain.com/docs/integrations/chat/) versions of Llama 3.
			
 
				+
			
 
				+(3) `LangGraph custom agent` - Uses [LangGraph](https://python.langchain.com/docs/langgraph) with **any** version of Llama 3 (so long as it supports structured output).
			
 
				+
			
 
				+As we move from option (1) to (3) the degree of customization and flexibility increases:
			
 
				+
			
 
				+(1) `ReAct agent` using AgentExecutor is a great for getting started quickly with minimal code, but requires a version of Llama 3 with reliable tool-calling, is the least customizable, and uses higher-level AgentExecutor abstraction.
			
 
				+  
			
 
				+(2) `LangGraph tool calling agent` is more customizable than (1) because the LLM assistant (planning) and tool call (action) nodes are defined by the user, but it still requires a version of Llama 3 with reliable tool-calling.
			
 
				+  
			
 
				+(3) `LangGraph custom agent` does not require a version of Llama 3 with reliable tool-calling and is the most customizable, but requires the most work to implement. 
			
 
				+
			
 
				+![langgraph_agent_architectures](https://github.com/rlancemartin/llama-recipes/assets/122662504/5ed2bef0-ae11-4efa-9e88-ab560a4d0022)
			
 
				+
			
 
				+---
			
 
				+
			
 
				+### `ReAct agent`
			
 
				+
			
 
				+The AgentExecutor manages the loop of planning, executing tool calls, and processing outputs until an AgentFinish signal is generated, indicating task completion.
			
 
				+
			
 
				+Our first notebook, `tool-calling-agent`, shows how to build a [tool calling agent](https://python.langchain.com/docs/modules/agents/agent_types/tool_calling/) with AgentExecutor and Llama 3.
			
 
				+
			
 
				+--- 
			
 
				+
			
 
				+### `LangGraph tool calling agent`
			
 
				+
			
 
				+[LangGraph](https://python.langchain.com/docs/langgraph) is a library from LangChain that can be used to build reliable agents.
			
 
				+
			
 
				+Our second notebook, `langgraph-tool-calling-agent`, shows an alternative to AgentExecutor for building a Llama 3 powered agent. 
			
 
				+
			
 
				+--- 
			
 
				+
			
 
				+### `LangGraph custom agent`
			
 
				+
			
 
				+Our third notebook, `langgraph-custom-agent`, shows how to build a Llama 3 powered agent without reliance on tool-calling. 
			
 
				+
			
 
				+--- 
			
 
				+
			
 
				+### `LangGraph RAG Agent`
			
 
				+
			
 
				+Our fourth notebook, `langgraph-rag-agent`, shows how to apply LangGraph to build a custom Llama 3 powered RAG agent that use ideas from 3 papers:
			
 
				+
			
 
				+* Corrective-RAG (CRAG) [paper](https://arxiv.org/pdf/2401.15884.pdf) uses self-grading on retrieved documents and web-search fallback if documents are not relevant.
			
 
				+* Self-RAG [paper](https://arxiv.org/abs/2310.11511) adds self-grading on generations for hallucinations and for ability to answer the question.
			
 
				+* Adaptive RAG [paper](https://arxiv.org/abs/2403.14403) routes queries between different RAG approaches based on their complexity.
			
 
				+
			
 
				+We implement each approach as a control flow in LangGraph:
			
 
				+- **Planning:** The sequence of RAG steps (e.g., retrieval, grading, and generation) that we want the agent to take.
			
 
				+- **Memory:** All the RAG-related information (input question, retrieved documents, etc) that we want to pass between steps.
			
 
				+- **Tool use:** All the tools needed for RAG (e.g., decide web search or vectorstore retrieval based on the question).
			
 
				+
			
 
				+We will build from CRAG (blue, below) to Self-RAG (green) and finally to Adaptive RAG (red):
			
 
				+
			
 
				+![langgraph_rag_agent_](https://github.com/rlancemartin/llama-recipes/assets/122662504/ec4aa1cd-3c7e-4cd1-a1e7-7deddc4033a8)
			
 
				+
			
 
				+--- 
			
 
				+ 
			
 
				+### `Local LangGraph RAG Agent`
			
 
				+
			
 
				+Our fifth notebook, `langgraph-rag-agent-local`, shows how to apply LangGraph to build advanced RAG agents using Llama 3 that run locally and reliably.
			
 
				+
			
 
				+See this [video overview](https://www.youtube.com/watch?v=sgnrL7yo1TE) for more detail on the design of this agent.
			
--- a/recipes/use_cases/agents/langchain/langgraph-custom-agent.ipynb
+++ b/recipes/use_cases/agents/langchain/langgraph-custom-agent.ipynb
--- a/recipes/use_cases/agents/langchain/langgraph-rag-agent-local.ipynb
+++ b/recipes/use_cases/agents/langchain/langgraph-rag-agent-local.ipynb
--- a/recipes/use_cases/agents/langchain/langgraph-rag-agent.ipynb
+++ b/recipes/use_cases/agents/langchain/langgraph-rag-agent.ipynb
--- a/recipes/use_cases/agents/langchain/langgraph-tool-calling-agent.ipynb
+++ b/recipes/use_cases/agents/langchain/langgraph-tool-calling-agent.ipynb
--- a/recipes/use_cases/agents/langchain/tool-calling-agent.ipynb
+++ b/recipes/use_cases/agents/langchain/tool-calling-agent.ipynb
--- a/recipes/use_cases/chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb
+++ b/recipes/use_cases/chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb
--- a/recipes/use_cases/chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb
+++ b/recipes/use_cases/chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb
--- a/recipes/use_cases/chatbots/sales_bot/Musical_instruments_reviews.csv
+++ b/recipes/use_cases/chatbots/sales_bot/Musical_instruments_reviews.csv
--- a/recipes/use_cases/chatbots/sales_bot/SalesBot.ipynb
+++ b/recipes/use_cases/chatbots/sales_bot/SalesBot.ipynb
@@ -0,0 +1,668 @@
 
				+{
			
 
				+ "cells": [
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "374b67d0-b446-4d6f-8e07-59e97716c55a",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# Sales Bot with Llama3 - A Summarization and RAG Use Case"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "add4953d-07c3-4480-ad91-7d0ea9c9fb55",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## Overview\n",
			
 
				+    "\n",
			
 
				+    "In this notebook you'll take an Amazon product reviews dataset from Kaggle and use Llama3 to obtain product review summaries, upsert those summaries in a vector database, then use Retrieval Augmented Generation (RAG) to power a sales chatbot that can make targeted product recommendations.\n",
			
 
				+    "\n",
			
 
				+    "Let's take a look at the overall workflow:\n",
			
 
				+    "1. We start with a dataset that contains over 10,000 reviews across 900 Amazon musical instruments and accessories.\n",
			
 
				+    "2. Using Llama2 70B chat (hosted on OctoAI), we generate summaries of product reviews for each product from the 20 most recent reviews. We format the summaries in JSON format.\n",
			
 
				+    "3. We then take the summaries and upsert them into a vector database (Weaviate in this case)\n",
			
 
				+    "4. We then use this vector database and Llama3 70B instruct (hosted on OctoAI) to build a RAG-based sales chatbot that provides targeted recommendations to the user based on the products that are present in the inventory.\n",
			
 
				+    "\n",
			
 
				+    "Note: at the time of writing this tutorial, JSON mode formatting isn't supported for Llama 3 on OctoAI via constrained sampling which is why we are falling back onto Llama 2. This tutorial will be updated when the feature becomes available to rely on Llama 3 exclusively.\n",
			
 
				+    "\n",
			
 
				+    "### OctoAI\n",
			
 
				+    "We'll use [OctoAI](https://octo.ai/) to power all of the GenAI model needs of this notebook: LLMs, image gen, image animation.\n",
			
 
				+    "* To use OctoAI, you'll need to go to https://octoai.cloud/ and sign in using your Google or GitHub account.\n",
			
 
				+    "* Next you'll need to generate an OctoAI API token by following these [instructions](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token). Keep the API token in hand, we'll need it further down in this notebook.\n",
			
 
				+    "\n",
			
 
				+    "In this example we will use the Llama 3 70b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
			
 
				+    "\n",
			
 
				+    "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
			
 
				+    "* meta-llama-3-8b-instruct\n",
			
 
				+    "* meta-llama-3-70b-instruct\n",
			
 
				+    "* codellama-7b-instruct\n",
			
 
				+    "* codellama-13b-instruct\n",
			
 
				+    "* codellama-34b-instruct\n",
			
 
				+    "* llama-2-13b-chat\n",
			
 
				+    "* llama-2-70b-chat\n",
			
 
				+    "* llamaguard-7b\n",
			
 
				+    "\n",
			
 
				+    "### Weaviate\n",
			
 
				+    "We'll use Weaviate Cloud Services (WCS) for our vector database. You can create an account and Weaviate clusters easily at the following link: https://console.weaviate.cloud/.\n",
			
 
				+    "You can then create a cluster, from which you can obtain the REST Endpoint URL and the API key to use the cluster endpoint.\n",
			
 
				+    "\n",
			
 
				+    "### OpenAI\n",
			
 
				+    "We'll be using OpenAI for its embedding model to upsert our vectors into the Weaviate vector database. Create an account and obtain an API key here: https://openai.com/api/\n",
			
 
				+    "\n",
			
 
				+    "### Local Python Notebook\n",
			
 
				+    "We highly recommend launching this notebook from a fresh python environment, for instance you can run the following:\n",
			
 
				+    "```\n",
			
 
				+    "python3 -m venv .venv         \n",
			
 
				+    "source .venv/bin/activate\n",
			
 
				+    "```\n",
			
 
				+    "All you need to run this notebook is to install jupyter notebook with `python3 -m pip install notebook` then run `jupyter notebook` ([link](https://jupyter.org/install)) in the same directory as this `.ipynb` file.\n",
			
 
				+    "You don't need to install additional pip packages ahead of running the notebook, since those will be installed right at the beginning. You will need to ensure your system has `imagemagick` installed by following the [instructions](https://imagemagick.org/script/download.php)."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "133c2ea4-0256-49cf-9f5a-a9e5bb0bb63f",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Let's start by installing the appropriate python packages\n",
			
 
				+    "! pip install octoai===1.0.2 openai weaviate-client pandas gradio pydantic"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "75341227-43f8-4a68-b3cb-31e8216f874e",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "## Part 1: Review Summarization"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "793c06d7-fa67-4c67-a380-081ed3a7a7bf",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Let's start by importing all of the packages we need for this example"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "edd366c8-4f0b-4211-83d3-c16e88cbd5c8",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "import gradio\n",
			
 
				+    "import json\n",
			
 
				+    "import langchain\n",
			
 
				+    "import os\n",
			
 
				+    "import openai\n",
			
 
				+    "import weaviate\n",
			
 
				+    "from getpass import getpass\n",
			
 
				+    "from json import loads\n",
			
 
				+    "from pandas import DataFrame, concat, read_csv\n",
			
 
				+    "from pydantic import BaseModel, Field\n",
			
 
				+    "from typing import List\n",
			
 
				+    "import weaviate.classes as wvc"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "cd171a7c-c5e7-46d5-8a04-a0f7863609be",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Enter your OctoAI, Weaviate, and OpenAI tokens below"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "3af09686-a654-45b0-98c5-dee6f30440c7",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Get OctoAI API token for Llama 2 & 3\n",
			
 
				+    "OCTOAI_API_TOKEN = getpass()\n",
			
 
				+    "os.environ[\"OCTOAI_API_TOKEN\"] = OCTOAI_API_TOKEN"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "31c3e684-6e5e-41ad-81d4-970b06522553",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Get WCS API key\n",
			
 
				+    "WCS_API_KEY = getpass()\n",
			
 
				+    "os.environ[\"WCS_API_KEY\"] = WCS_API_KEY"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "a44f7b71-c4f9-4fd6-9a3b-1322c2fd0c35",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Get WCS URL\n",
			
 
				+    "WCS_URL = getpass()\n",
			
 
				+    "os.environ[\"WCS_URL\"] = WCS_URL"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "e4502dfa-c369-4085-a697-fdcda00f970b",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Get OpenAI API key for the embedding model\n",
			
 
				+    "OPENAI_API_KEY = getpass()\n",
			
 
				+    "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "883986ad-9f60-44d8-ab64-3f566261e055",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# First let's load the dataset from Kaggle: https://www.kaggle.com/datasets/eswarchandt/amazon-music-reviews\n",
			
 
				+    "df = read_csv('Musical_instruments_reviews.csv')"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "c05865a7-307a-425e-a6ee-f057d63db77b",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Set `product_record_limit` to a lower number if you just want to do a test run"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "22f024e7-3976-425f-b684-8b2c2c1ed191",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Set a product record limit\n",
			
 
				+    "product_record_limit = 900"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "06554f51-5983-42fc-8a8e-684ae82099db",
			
 
				+   "metadata": {
			
 
				+    "scrolled": true
			
 
				+   },
			
 
				+   "source": [
			
 
				+    "# List all of the unique ASIN:\n",
			
 
				+    "asin_list = df.asin.unique()\n",
			
 
				+    "print(\"There are {} unique products in the music product inventory\".format(len(asin_list)))"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "4941baa1-107b-4f39-8d04-1daa5acd465b",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "For each one of the unique products, let's group the reviews together and sort them by how recent they are"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "38147b91-2425-46a7-b6c0-221173d81024",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Get the reviews for the product ASIN, sorted by recency and store in dict\n",
			
 
				+    "review_dict = {}\n",
			
 
				+    "for asin in asin_list[0:product_record_limit]:\n",
			
 
				+    "    reviews = df.loc[df['asin'] == asin]\\\n",
			
 
				+    "                .sort_values([\"unixReviewTime\"], axis=0, ascending=False)\\\n",
			
 
				+    "                .reviewText.tolist()\n",
			
 
				+    "    review_dict[asin] = reviews"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "7d5fb78d-808a-4753-abba-4a3066d76ba7",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "To be able to store our summaries into our vector DB, we need to have the fields formatted into a JSON object. We use Pydantic base class model here to define our formatting."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "b786cde1-116a-47eb-8478-3fa2285dcf9d",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Define the Pydantic model that specifies how our output should be formatted\n",
			
 
				+    "class ProductRecord(BaseModel):\n",
			
 
				+    "    \"\"\"The record of a given product\"\"\"\n",
			
 
				+    "    description: str = Field(description=\"Description of the product\")\n",
			
 
				+    "    name: str = Field(description=\"Name of the product\")\n",
			
 
				+    "    review_summary: str = Field(description=\"Summary of all of the reviews\")\n",
			
 
				+    "    ASIN: str = Field(description=\"ASIN of the product\")\n",
			
 
				+    "    features: str = Field(description=\"Features of the product based on the reviews\")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "08226a6e-f994-454b-9a1d-6246b34bfca2",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "We define our prompt template below."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "1cc3fe69-bf0c-4a50-8d9c-1ae6cb99a9ca",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Prepare a prompt template\n",
			
 
				+    "template = '''\n",
			
 
				+    "Here are product reviews for a music product with an ID of {asin}.\n",
			
 
				+    " - Respond back only as only JSON!\n",
			
 
				+    " - Provide:\n",
			
 
				+    "     - the product \"description\",\n",
			
 
				+    "     - the product \"name\",\n",
			
 
				+    "     - a summary of all the reviews as \"review_summary\",\n",
			
 
				+    "     - the \"ASIN\" and\n",
			
 
				+    "     - and the product \"features\" based on the content of these reviews. \n",
			
 
				+    " - The \"features\" should be a string describing the features and NOT JSON. \n",
			
 
				+    " - Do not include the ASIN in the description field.\n",
			
 
				+    " \n",
			
 
				+    "The reviews for the product are: {reviews}\n",
			
 
				+    "'''"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "9b8dc3fa-4ad9-4329-96a0-353b05a1c43e",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "We initialize the OctoAI client using OpenAI's API. All we have to do is override the `base_url` and `api_key`."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "57c2ff0a-8029-41a6-a06f-41e560b92230",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Init OctoAI client\n",
			
 
				+    "client = openai.OpenAI(\n",
			
 
				+    "    base_url=\"https://text.octoai.run/v1\",\n",
			
 
				+    "    api_key=os.environ[\"OCTOAI_API_TOKEN\"]\n",
			
 
				+    ")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "bd0eb425-ceea-4258-a52d-814b7335febb",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Iterate over all product ASINs and summarize the top 20 most recent reviews. Note: this takes a while to run unless we parallelize it."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "1a55839e-a824-4919-b755-730eaac48d83",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Produce the 900 product summaries\n",
			
 
				+    "review_summaries = []\n",
			
 
				+    "counter = 0\n",
			
 
				+    "\n",
			
 
				+    "# This can take a while to process serially (30min+)\n",
			
 
				+    "# TODO: Optimize to run in a few parallel threads to run faster while meeting the 240RPM limit\n",
			
 
				+    "for asin, review_list in review_dict.items():\n",
			
 
				+    "    print(f'Getting review summary {counter} of {len(review_dict)}, ASIN: {asin}')\n",
			
 
				+    "    try:\n",
			
 
				+    "        response = client.chat.completions.create(\n",
			
 
				+    "            model=\"llama-2-70b-chat\",\n",
			
 
				+    "            messages=[\n",
			
 
				+    "                {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
			
 
				+    "                {\"role\": \"user\", \"content\": template.format(\n",
			
 
				+    "                    asin = asin,\n",
			
 
				+    "                    reviews = review_list[0:20]\n",
			
 
				+    "                )},\n",
			
 
				+    "            ],\n",
			
 
				+    "            temperature=0,\n",
			
 
				+    "            response_format={\"type\": \"json_object\", \"schema\": ProductRecord.model_json_schema()},\n",
			
 
				+    "            max_tokens=1024\n",
			
 
				+    "        )\n",
			
 
				+    "        print(\"\\n{}\\n\".format(response.choices[0].message.content))\n",
			
 
				+    "        summary = loads(response.choices[0].message.content)\n",
			
 
				+    "        summary[\"ASIN\"] = asin\n",
			
 
				+    "        review_summaries.append(summary)\n",
			
 
				+    "    except:\n",
			
 
				+    "        print(f'Issue with ASIN {asin}, skipping')\n",
			
 
				+    "        pass\n",
			
 
				+    "    counter += 1\n",
			
 
				+    "\n",
			
 
				+    "review_summaries = DataFrame(review_summaries)\n",
			
 
				+    "\n",
			
 
				+    "print(review_summaries)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "4772d1c1-c9c4-466e-9c80-259804a4286b",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# Part 2: Retrieval Augmented Generation"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "ccd97408-d47f-46f3-b601-f66f8a3b20ff",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "For our RAG use case we're going to rely on Weaviate vector database and on an OpenAI embedding model. \n",
			
 
				+    "\n",
			
 
				+    "When you define your collection, you'll need to provide properties, i.e. object attributes that you want to store in the collection. These properties map 1:1 to the JSON dictionary keys defined earlier for the `ProductRecord` Pydantic base model."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "5dad98ec-531d-4fc2-aed9-9f337b957feb",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Connect to WCS\n",
			
 
				+    "wcs_client = weaviate.connect_to_wcs(\n",
			
 
				+    "    cluster_url=os.getenv(\"WCS_URL\"),\n",
			
 
				+    "    auth_credentials=weaviate.auth.AuthApiKey(os.getenv(\"WCS_API_KEY\")),\n",
			
 
				+    "    headers={\n",
			
 
				+    "        \"X-OpenAI-Api-Key\": os.environ[\"OPENAI_API_KEY\"]\n",
			
 
				+    "    }\n",
			
 
				+    ")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "02953f7b-0149-4c13-a7cc-c4dd1da45d43",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Create the collection if it doesn't already exist\n",
			
 
				+    "try:\n",
			
 
				+    "    collection = wcs_client.collections.get(\"Products\")\n",
			
 
				+    "except:\n",
			
 
				+    "    # Create the collection for products\n",
			
 
				+    "    collection = wcs_client.collections.create(\n",
			
 
				+    "        name=\"Products\",\n",
			
 
				+    "        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),\n",
			
 
				+    "        properties=[\n",
			
 
				+    "            wvc.config.Property(\n",
			
 
				+    "                name=\"ASIN\",\n",
			
 
				+    "                data_type=wvc.config.DataType.TEXT\n",
			
 
				+    "            ),\n",
			
 
				+    "             wvc.config.Property(\n",
			
 
				+    "                name=\"name\",\n",
			
 
				+    "                data_type=wvc.config.DataType.TEXT\n",
			
 
				+    "            ),\n",
			
 
				+    "             wvc.config.Property(\n",
			
 
				+    "                name=\"review_summary\",\n",
			
 
				+    "                data_type=wvc.config.DataType.TEXT\n",
			
 
				+    "            ),\n",
			
 
				+    "             wvc.config.Property(\n",
			
 
				+    "                name=\"features\",\n",
			
 
				+    "                data_type=wvc.config.DataType.TEXT\n",
			
 
				+    "            ),\n",
			
 
				+    "             wvc.config.Property(\n",
			
 
				+    "                name=\"description\",\n",
			
 
				+    "                data_type=wvc.config.DataType.TEXT\n",
			
 
				+    "            ),\n",
			
 
				+    "        ]\n",
			
 
				+    "    )\n",
			
 
				+    "    print(\"Collection Created!\")"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "1551fd74-b143-4c02-9b56-364d33683fd3",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Now we upsert all of the vectors into the databse using OpenAI's embedding model."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "53f779e7-b875-4a19-9f9c-74b45992608e",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Convert df to JSON string and then to a list of dictionaries\n",
			
 
				+    "data = review_summaries.to_json(orient='records')\n",
			
 
				+    "data_list = json.loads(data)\n",
			
 
				+    "\n",
			
 
				+    "items_to_insert = []\n",
			
 
				+    "\n",
			
 
				+    "for d in data_list:\n",
			
 
				+    "    new_item = {\n",
			
 
				+    "        \"ASIN\": d[\"ASIN\"],\n",
			
 
				+    "        \"name\": d[\"name\"],\n",
			
 
				+    "        \"description\": d[\"description\"],  \\\n",
			
 
				+    "        \"features\": d[\"features\"],\n",
			
 
				+    "        \"review_summary\": d[\"review_summary\"]\n",
			
 
				+    "    }\n",
			
 
				+    "    items_to_insert.append(new_item)\n",
			
 
				+    "\n",
			
 
				+    "    # Insert every 100 items\n",
			
 
				+    "    if len(items_to_insert) == 100:\n",
			
 
				+    "        collection.data.insert_many(items_to_insert)\n",
			
 
				+    "        items_to_insert.clear()\n",
			
 
				+    "\n",
			
 
				+    "# Insert remaining items\n",
			
 
				+    "if len(items_to_insert) > 0:\n",
			
 
				+    "    collection.data.insert_many(items_to_insert)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "35079318-41a5-46fc-8475-5d728550fb88",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Let's now try to run a hybrid search on the following query below.\n",
			
 
				+    "Hybrid search combines the results of a vector search and a keyword (BM25F) search by fusing the two result sets.\n",
			
 
				+    "It will return the 3 closest entries in the database according to the search criteria."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "5f707954-c36b-4a83-874b-f817bd33c39a",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Hybrid search\n",
			
 
				+    "response = collection.query.hybrid(\n",
			
 
				+    "    query=\"easy to learn instrument\",\n",
			
 
				+    "    limit=3\n",
			
 
				+    ")\n",
			
 
				+    "for o in response.objects:\n",
			
 
				+    "    print(o.properties)"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "04d39507-5e8e-4374-a33c-53e57db6ef99",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Let's now define a helper function that gives us the relevant context given a string query. Let's see what it returns based on the question: \"What is a good beginner harmonica\""
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "a1ca51c7-83e5-4896-acc9-753060592ba0",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "# Helper function to run hybrid search on a user query and return the closest\n",
			
 
				+    "# product review summaries relevant to the user query\n",
			
 
				+    "def get_context(question, limit=3):\n",
			
 
				+    "    response = collection.query.hybrid(\n",
			
 
				+    "        query=question,\n",
			
 
				+    "        limit=limit\n",
			
 
				+    "    )\n",
			
 
				+    "    return \"\\n\".join([str(o.properties) for o in response.objects])\n",
			
 
				+    "\n",
			
 
				+    "print(get_context(\"What is a good beginner harmonica\"))"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "677f534c-8be4-4b6b-82d9-2df8e2ad12d4",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "Great, we're now ready to build a sales assistant helper function.\n",
			
 
				+    "\n",
			
 
				+    "We first define a prompt template for Llama 3 - based on the context provided by the vector hybrid search (i.e. collection of product summaries of relevance to the question), provide a helpful recommendation to the customer. \n",
			
 
				+    "\n",
			
 
				+    "Also provide links to the product that the user can click on to view the product on Amazon's website. For that we use the fact that any product referenced by its aSIN can be accessed at the following url: `https://www.amazon.com/exec/obidos/ASIN/<insert aSIN here>`"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "856d021a-add5-48f4-a09c-258d2a617095",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "sales_template = \"\"\"\n",
			
 
				+    "You are a sales assistant. Answer the user questions as helpfully as possible.\n",
			
 
				+    "Only recommend the products that are provided in the context provided below.\n",
			
 
				+    "\n",
			
 
				+    "Provide a reference to each product you mention with hyperlinks:\n",
			
 
				+    "* Provide the name of the product\n",
			
 
				+    "* Embed the hyperlink in the name of the product as follows\n",
			
 
				+    "    * If the product name is \"Solid Electric Guitar Case with Accessories Compartment\"\n",
			
 
				+    "    * And the aSIN is \"B001EL6I8W\"\n",
			
 
				+    "    * Format the reference as follows: \n",
			
 
				+    "         [Solid Electric Guitar Case with Accessories Compartment](https://www.amazon.com/exec/obidos/ASIN/B001EL6I8W)\n",
			
 
				+    "\n",
			
 
				+    "Finish with a references section.\n",
			
 
				+    "\n",
			
 
				+    "Customer question: {}\n",
			
 
				+    "\n",
			
 
				+    "Product context: {}\n",
			
 
				+    "\n",
			
 
				+    "AI:\n",
			
 
				+    "\"\"\"\n",
			
 
				+    "\n",
			
 
				+    "def sales_assistant(question):  \n",
			
 
				+    "    response = client.chat.completions.create(\n",
			
 
				+    "                model=\"meta-llama-3-70b-instruct\",\n",
			
 
				+    "                messages=[\n",
			
 
				+    "                    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
			
 
				+    "                    {\"role\": \"user\", \"content\": sales_template.format(question, get_context(question, limit=10))},\n",
			
 
				+    "                ],\n",
			
 
				+    "                temperature=0,\n",
			
 
				+    "                max_tokens=1024\n",
			
 
				+    "            )\n",
			
 
				+    "    \n",
			
 
				+    "    return response.choices[0].message.content\n",
			
 
				+    "\n",
			
 
				+    "print(sales_assistant(\"what is must have accessory for my new electric guitar\"))"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "faccba14-9216-4420-b6c5-ddf4029d7904",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "# Part 3: Gradio-based sales assistant demo"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "3e2b73b5-6bdf-4c87-b044-2690fd52605f",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "In this section we build a simple an interactive sales bot assistant using Gradio."
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "code",
			
 
				+   "execution_count": null,
			
 
				+   "id": "53805acb-3e8d-40fa-8045-c589cb14eadd",
			
 
				+   "metadata": {},
			
 
				+   "outputs": [],
			
 
				+   "source": [
			
 
				+    "import gradio as gr\n",
			
 
				+    "\n",
			
 
				+    "def predict(message, history):\n",
			
 
				+    "    history_openai_format = []\n",
			
 
				+    "    for human, assistant in history:\n",
			
 
				+    "        history_openai_format.append({\"role\": \"user\", \"content\": human})\n",
			
 
				+    "        history_openai_format.append({\"role\": \"assistant\", \"content\": assistant})\n",
			
 
				+    "    history_openai_format.append({\"role\": \"user\", \"content\": sales_template.format(message, get_context(message, limit=5))})\n",
			
 
				+    "\n",
			
 
				+    "    response = client.chat.completions.create(\n",
			
 
				+    "        model = 'meta-llama-3-70b-instruct',\n",
			
 
				+    "        messages = history_openai_format,\n",
			
 
				+    "        temperature = 0.0,\n",
			
 
				+    "        stream = True\n",
			
 
				+    "     )\n",
			
 
				+    "\n",
			
 
				+    "    partial_message = \"\"\n",
			
 
				+    "    for chunk in response:\n",
			
 
				+    "        if chunk.choices[0].delta.content is not None:\n",
			
 
				+    "              partial_message = partial_message + chunk.choices[0].delta.content\n",
			
 
				+    "              yield partial_message\n",
			
 
				+    "\n",
			
 
				+    "gr.ChatInterface(predict).launch()"
			
 
				+   ]
			
 
				+  },
			
 
				+  {
			
 
				+   "cell_type": "markdown",
			
 
				+   "id": "6d4e65fe-0246-40b7-adb6-9091cccbc486",
			
 
				+   "metadata": {},
			
 
				+   "source": [
			
 
				+    "**Authors**\n",
			
 
				+    "- Thierry Moreau, OctoAI - tmoreau@octo.ai\n",
			
 
				+    "- Jonathan Tuite, Weaviate - jon@weaviate.io"
			
 
				+   ]
			
 
				+  }
			
 
				+ ],
			
 
				+ "metadata": {
			
 
				+  "kernelspec": {
			
 
				+   "display_name": "Python 3 (ipykernel)",
			
 
				+   "language": "python",
			
 
				+   "name": "python3"
			
 
				+  },
			
 
				+  "language_info": {
			
 
				+   "codemirror_mode": {
			
 
				+    "name": "ipython",
			
 
				+    "version": 3
			
 
				+   },
			
 
				+   "file_extension": ".py",
			
 
				+   "mimetype": "text/x-python",
			
 
				+   "name": "python",
			
 
				+   "nbconvert_exporter": "python",
			
 
				+   "pygments_lexer": "ipython3",
			
 
				+   "version": "3.11.6"
			
 
				+  }
			
 
				+ },
			
 
				+ "nbformat": 4,
			
 
				+ "nbformat_minor": 5
			
 
				+}
			
--- a/recipes/use_cases/llamaindex_cookbook.ipynb
+++ b/recipes/use_cases/llamaindex_cookbook.ipynb
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,3 +18,4 @@ gradio
 
				 chardet
			
 
				 openai
			
 
				 typing-extensions==4.8.0
			
 
				+tabulate
			
--- a/src/llama_recipes/configs/peft.py
+++ b/src/llama_recipes/configs/peft.py
@@ -20,7 +20,8 @@ class llama_adapter_config:
 
				      adapter_layers: int= 30
			
 
				      task_type: str= "CAUSAL_LM"
			
 
				 
			
 
				+#CAUTION prefix tuning is currently not supported
			
 
				 @dataclass
			
 
				 class prefix_config:
			
 
				      num_virtual_tokens: int=30
			
 
				-     task_type: str= "CAUSAL_LM"    
			
 
				+     task_type: str= "CAUSAL_LM"
			
--- a/src/llama_recipes/configs/training.py
+++ b/src/llama_recipes/configs/training.py
@@ -6,7 +6,7 @@ from dataclasses import dataclass
 
				 
			
 
				 @dataclass
			
 
				 class train_config:
			
 
				-    model_name: str="PATH/to/LLAMA/7B"
			
 
				+    model_name: str="PATH/to/Model"
			
 
				     tokenizer_name: str=None
			
 
				     enable_fsdp: bool=False
			
 
				     low_cpu_fsdp: bool=False
			
@@ -29,8 +29,9 @@ class train_config:
 
				     mixed_precision: bool=True
			
 
				     val_batch_size: int=1
			
 
				     dataset = "samsum_dataset"
			
 
				-    peft_method: str = "lora" # None , llama_adapter, prefix
			
 
				+    peft_method: str = "lora" # None, llama_adapter (Caution: llama_adapter is currently not supported with FSDP)
			
 
				     use_peft: bool=False
			
 
				+    from_peft_checkpoint: str="" # if not empty and use_peft=True, will load the peft checkpoint and resume the fine-tuning on that checkpoint
			
 
				     output_dir: str = "PATH/to/save/PEFT/model"
			
 
				     freeze_layers: bool = False
			
 
				     num_freeze_layers: int = 1
			
@@ -43,3 +44,7 @@ class train_config:
 
				     use_fast_kernels: bool = False # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels
			
 
				     use_wandb: bool = False # Enable wandb for experient tracking
			
 
				     save_metrics: bool = False # saves training metrics to a json file for later plotting
			
 
				+    flop_counter: bool = False # Enable flop counter to measure model throughput, can not be used with pytorch profiler at the same time.
			
 
				+    flop_counter_start: int = 3 # The step to start profiling, default is 3, which means after 3 steps of warmup stage, the profiler will start to count flops.
			
 
				+    use_profiler: bool = False # Enable pytorch profiler, can not be used with flop counter at the same time.
			
 
				+    profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler
			
--- a/src/llama_recipes/datasets/alpaca_dataset.py
+++ b/src/llama_recipes/datasets/alpaca_dataset.py
@@ -26,10 +26,12 @@ PROMPT_DICT = {
 
				 class InstructionDataset(Dataset):
			
 
				     def __init__(self, dataset_config, tokenizer, partition="train"):
			
 
				         self.ann = json.load(open(dataset_config.data_path))
			
 
				+        # Use 5% of the dataset for evaluation
			
 
				+        eval_length = int(len(self.ann)/20)
			
 
				         if partition == "train":
			
 
				-            self.ann = self.ann[200:]
			
 
				+            self.ann = self.ann[eval_length:]
			
 
				         else:
			
 
				-            self.ann = self.ann[:200]
			
 
				+            self.ann = self.ann[:eval_length]
			
 
				 
			
 
				         self.tokenizer = tokenizer
			
 
				 
			
--- a/src/llama_recipes/finetuning.py
+++ b/src/llama_recipes/finetuning.py
@@ -8,7 +8,7 @@ import fire
 
				 import random
			
 
				 import torch
			
 
				 import torch.optim as optim
			
 
				-from peft import get_peft_model, prepare_model_for_kbit_training
			
 
				+from peft import get_peft_model, prepare_model_for_kbit_training, PeftModel
			
 
				 from torch.distributed.fsdp import (
			
 
				     FullyShardedDataParallel as FSDP,
			
 
				     ShardingStrategy
			
@@ -134,7 +134,7 @@ def main(**kwargs):
 
				     tokenizer = AutoTokenizer.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
			
 
				     tokenizer.pad_token_id = tokenizer.eos_token_id
			
 
				 
			
 
				-    # If there is a mismatch between tokenizer vocab size and embedding matrix, 
			
 
				+    # If there is a mismatch between tokenizer vocab size and embedding matrix,
			
 
				     # throw a warning and then expand the embedding matrix
			
 
				     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
			
 
				         print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.")
			
@@ -151,11 +151,17 @@ def main(**kwargs):
 
				         model.to(torch.bfloat16)
			
 
				 
			
 
				     if train_config.use_peft:
			
 
				-        peft_config = generate_peft_config(train_config, kwargs)
			
 
				-        model = get_peft_model(model, peft_config)
			
 
				-        model.print_trainable_parameters()
			
 
				+        # Load the pre-trained peft model checkpoint and setup its configuration
			
 
				+        if train_config.from_peft_checkpoint:
			
 
				+            model = PeftModel.from_pretrained(model, train_config.from_peft_checkpoint, is_trainable=True)
			
 
				+            peft_config = model.peft_config()
			
 
				+        # Generate the peft config and start fine-tuning from original model
			
 
				+        else:
			
 
				+            peft_config = generate_peft_config(train_config, kwargs)
			
 
				+            model = get_peft_model(model, peft_config)
			
 
				         if wandb_run:
			
 
				             wandb_run.config.update(peft_config)
			
 
				+        model.print_trainable_parameters()
			
 
				 
			
 
				 
			
 
				     hsdp_device_mesh = None
			
@@ -166,8 +172,7 @@ def main(**kwargs):
 
				     #setting up FSDP if enable_fsdp is enabled
			
 
				     if train_config.enable_fsdp:
			
 
				         if not train_config.use_peft and train_config.freeze_layers:
			
 
				-
			
 
				-            freeze_transformer_layers(train_config.num_freeze_layers)
			
 
				+            freeze_transformer_layers(model, train_config.num_freeze_layers)
			
 
				 
			
 
				         mixed_precision_policy, wrapping_policy = get_policies(fsdp_config, rank)
			
 
				         my_auto_wrapping_policy = fsdp_auto_wrap_policy(model, LlamaDecoderLayer)
			
@@ -188,7 +193,7 @@ def main(**kwargs):
 
				             device_id=device_id,
			
 
				             limit_all_gathers=True,
			
 
				             sync_module_states=train_config.low_cpu_fsdp,
			
 
				-            param_init_fn=lambda module: module.to_empty(device=torch.device("cuda"), recurse=False)
			
 
				+            param_init_fn=(lambda module: module.to_empty(device=torch.device("cuda"), recurse=False))
			
 
				             if train_config.low_cpu_fsdp and rank != 0 else None,
			
 
				         )
			
 
				         if fsdp_config.fsdp_activation_checkpointing:
			
@@ -217,7 +222,7 @@ def main(**kwargs):
 
				         split="test",
			
 
				     )
			
 
				     if not train_config.enable_fsdp or rank == 0:
			
 
				-            print(f"--> Validation Set Length = {len(dataset_val)}")
			
 
				+        print(f"--> Validation Set Length = {len(dataset_val)}")
			
 
				 
			
 
				     if train_config.batching_strategy == "packing":
			
 
				         dataset_train = ConcatDataset(dataset_train, chunk_size=train_config.context_length)
			
@@ -245,6 +250,10 @@ def main(**kwargs):
 
				             pin_memory=True,
			
 
				             **val_dl_kwargs,
			
 
				         )
			
 
				+        if len(eval_dataloader) == 0:
			
 
				+            raise ValueError("The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set.")
			
 
				+        else:
			
 
				+            print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
			
 
				 
			
 
				     # Initialize the optimizer and learning rate scheduler
			
 
				     if fsdp_config.pure_bf16 and fsdp_config.optimizer == "anyprecision":
			
--- a/src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py
+++ b/src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py
@@ -8,7 +8,7 @@ import os
 
				 import sys
			
 
				 import yaml
			
 
				 
			
 
				-from transformers import LlamaTokenizer
			
 
				+from transformers import AutoTokenizer
			
 
				 
			
 
				 from llama_recipes.inference.model_utils import  load_llama_from_config
			
 
				 
			
@@ -56,7 +56,7 @@ def main(
 
				     model = load_sharded_model_single_gpu(model_def, fsdp_checkpoint_path)
			
 
				     print("model is loaded from FSDP checkpoints")
			
 
				     #loading the tokenizer form the  model_path
			
 
				-    tokenizer = LlamaTokenizer.from_pretrained(HF_model_path_or_name)
			
 
				+    tokenizer = AutoTokenizer.from_pretrained(HF_model_path_or_name)
			
 
				     tokenizer.save_pretrained(consolidated_model_path)
			
 
				     #save the FSDP sharded checkpoints in HF format
			
 
				     model.save_pretrained(consolidated_model_path)
			
--- a/src/llama_recipes/inference/llm.py
+++ b/src/llama_recipes/inference/llm.py
@@ -187,6 +187,7 @@ class OctoAI(LLM):
 
				     @override
			
 
				     def valid_models(self) -> list[str]:
			
 
				         return [
			
 
				-            "llamaguard-7b",
			
 
				-            "llama-2-13b-chat",
			
 
				+            "llamaguard-2-8b",
			
 
				+            "meta-llama-3-8b-instruct",
			
 
				+            "meta-llama-3-70b-instruct",        
			
 
				         ]
			
--- a/src/llama_recipes/utils/config_utils.py
+++ b/src/llama_recipes/utils/config_utils.py
@@ -34,7 +34,7 @@ def update_config(config, **kwargs):
 
				                     if hasattr(config, param_name):
			
 
				                         setattr(config, param_name, v)
			
 
				                     else:
			
 
				-                        # In case of specialized config we can warm user
			
 
				+                        # In case of specialized config we can warn user
			
 
				                         print(f"Warning: {config_name} does not accept parameter: {k}")
			
 
				             elif isinstance(config, train_config):
			
 
				                 print(f"Warning: unknown parameter {k}")
			
@@ -45,7 +45,14 @@ def generate_peft_config(train_config, kwargs):
 
				     peft_configs = (LoraConfig, AdaptionPromptConfig, PrefixTuningConfig)
			
 
				     names = tuple(c.__name__.rstrip("_config") for c in configs)
			
 
				 
			
 
				-    assert train_config.peft_method in names, f"Peft config not found: {train_config.peft_method}"
			
 
				+    if train_config.peft_method not in names:
			
 
				+        raise RuntimeError(f"Peft config not found: {train_config.peft_method}")
			
 
				+
			
 
				+    if train_config.peft_method == "prefix":
			
 
				+        raise RuntimeError("PrefixTuning is currently not supported (see https://github.com/meta-llama/llama-recipes/issues/359#issuecomment-2089350811)")
			
 
				+
			
 
				+    if train_config.enable_fsdp and train_config.peft_method == "llama_adapter":
			
 
				+        raise RuntimeError("Llama_adapter is currently not supported in combination with FSDP (see https://github.com/meta-llama/llama-recipes/issues/359#issuecomment-2089274425)")
			
 
				 
			
 
				     config = configs[names.index(train_config.peft_method)]()
			
 
				 
			
--- a/src/llama_recipes/utils/flop_utils.py
+++ b/src/llama_recipes/utils/flop_utils.py
@@ -0,0 +1,87 @@
 
				+from typing import Any, Dict, List, Optional, Union
			
 
				+import time
			
 
				+import torch
			
 
				+from torch.utils.flop_counter import FlopCounterMode
			
 
				+
			
 
				+
			
 
				+class FlopMeasure(FlopCounterMode):
			
 
				+    """
			
 
				+    ``FlopMeasure`` is a customized context manager that counts the number of
			
 
				+    flops within its context. It is based on ``FlopCounterMode`` with additional start_counting() and stop_counting() function so that the flop counting
			
 
				+    will only start after the warmup stage.
			
 
				+    It also supports hierarchical output by passing a module (or list of modules) to FlopCounterMode on construction.
			
 
				+
			
 
				+    Example usage
			
 
				+
			
 
				+    .. code-block:: python
			
 
				+
			
 
				+        model = ...
			
 
				+        flop_counter = FlopMeasure(model,local_rank=0,warmup_step=3)
			
 
				+        for batch in enumerate(dataloader):
			
 
				+            with flop_counter:
			
 
				+                model(batch)
			
 
				+                flop_counter.step()
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        mods: Optional[Union[torch.nn.Module, List[torch.nn.Module]]] = None,
			
 
				+        depth: int = 2,
			
 
				+        display: bool = True,
			
 
				+        custom_mapping: Dict[Any, Any] = None,
			
 
				+        rank=None,
			
 
				+        warmup_step: int = 3,
			
 
				+    ):
			
 
				+        super().__init__(mods, depth, display, custom_mapping)
			
 
				+        self.rank = rank
			
 
				+        self.warmup_step = warmup_step
			
 
				+        self.start_time = 0
			
 
				+        self.end_time = 0
			
 
				+
			
 
				+    def step(self):
			
 
				+        # decrease the warmup step by 1 for every step, so that the flop counting will start when warmup_step =0. Stop decreasing when warm_up reaches -1.
			
 
				+        if self.warmup_step >= 0:
			
 
				+            self.warmup_step -= 1
			
 
				+        if self.warmup_step == 0 and self.start_time == 0:
			
 
				+            self.start_time = time.time()
			
 
				+        elif self.warmup_step == -1 and self.start_time != 0 and self.end_time == 0:
			
 
				+            self.end_time = time.time()
			
 
				+    def __enter__(self):
			
 
				+        if self.warmup_step == 0:
			
 
				+            self.start_time = time.time()
			
 
				+        super().__enter__()
			
 
				+        return self
			
 
				+    def is_done(self):
			
 
				+        return self.warmup_step == -1
			
 
				+    def get_total_flops(self):
			
 
				+        return super().get_total_flops()
			
 
				+    def get_flops_per_sec(self):
			
 
				+        if self.start_time == 0 or self.end_time == 0:
			
 
				+            print("Warning: flop count did not finish correctly")
			
 
				+            return 0
			
 
				+        return super().get_total_flops()/ (self.end_time - self.start_time)
			
 
				+    def get_table(self, depth=2):
			
 
				+        return super().get_table(depth)
			
 
				+
			
 
				+    def __exit__(self, *args):
			
 
				+        if self.get_total_flops() == 0:
			
 
				+            print(
			
 
				+                "Warning: did not record any flops this time. Skipping the flop report"
			
 
				+            )
			
 
				+        else:
			
 
				+            if self.display:
			
 
				+                if self.rank is None or self.rank == 0:
			
 
				+                    print("Total time used in this flop counting step is: {}".format(self.end_time - self.start_time))
			
 
				+                    print("The total TFlop per second is: {}".format(self.get_flops_per_sec() / 1e12))
			
 
				+                    print("The tflop_count table is below:")
			
 
				+                    print(self.get_table(self.depth))
			
 
				+            # Disable the display feature so that we don't print the table again
			
 
				+            self.display = False
			
 
				+        super().__exit__(*args)
			
 
				+
			
 
				+    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
			
 
				+        # when warmup_step is 0, count the flops and return the original output
			
 
				+        if self.warmup_step == 0:
			
 
				+            return super().__torch_dispatch__(func, types, args, kwargs)
			
 
				+        # otherwise, just return the original output
			
 
				+        return func(*args, **kwargs)
			
--- a/src/llama_recipes/utils/fsdp_utils.py
+++ b/src/llama_recipes/utils/fsdp_utils.py
@@ -8,8 +8,6 @@ def fsdp_auto_wrap_policy(model, transformer_layer_name):
 
				 
			
 
				     from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy
			
 
				 
			
 
				-    from peft.tuners import PrefixEncoder, PromptEmbedding, PromptEncoder
			
 
				-
			
 
				     def lambda_policy_fn(module):
			
 
				         if (
			
 
				             len(list(module.named_children())) == 0
			
@@ -23,13 +21,7 @@ def fsdp_auto_wrap_policy(model, transformer_layer_name):
 
				     transformer_wrap_policy = functools.partial(
			
 
				         transformer_auto_wrap_policy,
			
 
				         transformer_layer_cls=(
			
 
				-            PrefixEncoder,
			
 
				-            PromptEncoder,
			
 
				-            PromptEmbedding,
			
 
				             transformer_layer_name,
			
 
				-            # FullyShardedDataParallelPlugin.get_module_class_from_name(
			
 
				-            #     model, transformer_layer_name
			
 
				-            # ),
			
 
				         ),
			
 
				     )
			
 
				 
			
--- a/src/llama_recipes/utils/train_utils.py
+++ b/src/llama_recipes/utils/train_utils.py
@@ -6,8 +6,8 @@ import time
 
				 import yaml
			
 
				 from contextlib import nullcontext
			
 
				 from pathlib import Path
			
 
				-from pkg_resources import packaging
			
 
				 from datetime import datetime
			
 
				+import contextlib
			
 
				 
			
 
				 
			
 
				 import torch
			
@@ -24,14 +24,48 @@ from llama_recipes.model_checkpointing import save_model_checkpoint, save_model_
 
				 from llama_recipes.policies import fpSixteen,bfSixteen, get_llama_wrapper
			
 
				 from llama_recipes.utils.memory_utils import MemoryTrace
			
 
				 from accelerate.utils import is_xpu_available, is_ccl_available
			
 
				-
			
 
				+from llama_recipes.utils.flop_utils import FlopMeasure
			
 
				 def set_tokenizer_params(tokenizer: LlamaTokenizer):
			
 
				     tokenizer.pad_token_id = 0
			
 
				     tokenizer.padding_side = "left"
			
 
				 
			
 
				-# Converting Bytes to Megabytes
			
 
				-def byte2mb(x):
			
 
				-    return int(x / 2**20)
			
 
				+@contextlib.contextmanager
			
 
				+def profile(cfg, local_rank=None):
			
 
				+    use_profiler: bool = cfg.use_profiler
			
 
				+    use_flop_counter: bool = cfg.flop_counter
			
 
				+    if use_flop_counter and use_profiler:
			
 
				+        raise ValueError("Cannot use both profiler and flop counter")
			
 
				+    if use_profiler:
			
 
				+        # profiler needs a warmup stage to get the accurate profiling results
			
 
				+        wait_step, warmup_step, active_step = 1, 2, 3
			
 
				+        min_step = wait_step + warmup_step + active_step + 1
			
 
				+        if cfg.max_train_step > 0 and cfg.max_train_step < min_step:
			
 
				+            raise ValueError(f"pytorch profiler requires at least {min_step} train steps to finish the warm-up and recording stage, {wait_step} for wait_step, {warmup_step} for warmup_step, {active_step} for profiling step, please increase the max_train_step, current max_train_step {cfg.max_train_step}")
			
 
				+        print(f"pytorch profiling is activated and results will be saved in {cfg.profiler_dir}")
			
 
				+        with torch.profiler.profile(
			
 
				+            activities=[
			
 
				+                torch.profiler.ProfilerActivity.CPU,
			
 
				+                torch.profiler.ProfilerActivity.CUDA,
			
 
				+            ],
			
 
				+            schedule=torch.profiler.schedule(wait=wait_step, warmup=warmup_step, active=active_step, repeat=1),
			
 
				+            on_trace_ready=torch.profiler.tensorboard_trace_handler(
			
 
				+                cfg.profiler_dir
			
 
				+            ),
			
 
				+            profile_memory=True,
			
 
				+            with_stack=False,
			
 
				+            with_flops=True,
			
 
				+            record_shapes=True,
			
 
				+        ) as torch_profiler:
			
 
				+            yield torch_profiler
			
 
				+    elif use_flop_counter:
			
 
				+        if cfg.max_train_step > 0 and cfg.max_train_step <= cfg.flop_counter_start:
			
 
				+            raise ValueError(f"flop counter requires at least {cfg.flop_counter_start + 1} train steps, please increase the max_train_step, current max_train_step {cfg.max_train_step}")
			
 
				+        with FlopMeasure(rank=local_rank,warmup_step=cfg.flop_counter_start) as flop_counter:
			
 
				+            yield flop_counter
			
 
				+    else:
			
 
				+        torch_profiler = contextlib.nullcontext()
			
 
				+        yield None
			
 
				+
			
 
				 
			
 
				 def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_scheduler, gradient_accumulation_steps, train_config, fsdp_config=None, local_rank=None, rank=None, wandb_run=None):
			
 
				     """
			
@@ -62,13 +96,14 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
 
				 
			
 
				 
			
 
				     autocast = torch.cuda.amp.autocast if train_config.use_fp16 else nullcontext
			
 
				-
			
 
				     train_prep = []
			
 
				     train_loss = []
			
 
				     val_prep = []
			
 
				     val_loss =[]
			
 
				 
			
 
				     if train_config.save_metrics:
			
 
				+        if not os.path.exists(train_config.output_dir):
			
 
				+            os.makedirs(train_config.output_dir, exist_ok=True)
			
 
				         metrics_filename = f"{train_config.output_dir}/metrics_data_{local_rank}-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.json"
			
 
				         train_step_perplexity = []
			
 
				         train_step_loss = []
			
@@ -92,73 +127,77 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
 
				             total_loss = 0.0
			
 
				             total_length = len(train_dataloader)//gradient_accumulation_steps
			
 
				             pbar = tqdm(colour="blue", desc=f"Training Epoch: {epoch+1}", total=total_length, dynamic_ncols=True)
			
 
				-            for step, batch in enumerate(train_dataloader):
			
 
				-                total_train_steps += 1
			
 
				-                # stop when the maximum number of training steps is reached
			
 
				-                if train_config.max_train_step > 0 and total_train_steps > train_config.max_train_step:
			
 
				-                    max_steps_reached = True
			
 
				-                    if not train_config.enable_fsdp or local_rank==0:
			
 
				-                        print("max training steps reached, stopping training, total_train_steps: ", total_train_steps-1)
			
 
				-                    break
			
 
				-                for key in batch.keys():
			
 
				-                    if train_config.enable_fsdp:
			
 
				-                        if is_xpu_available():
			
 
				-                            batch[key] = batch[key].to(torch.device(f"xpu:{local_rank}"))
			
 
				+            with profile(train_config,local_rank) as profile_context:
			
 
				+                for step, batch in enumerate(train_dataloader):
			
 
				+                    total_train_steps += 1
			
 
				+                    # stop when the maximum number of training steps is reached
			
 
				+                    if train_config.max_train_step > 0 and total_train_steps > train_config.max_train_step:
			
 
				+                        max_steps_reached = True
			
 
				+                        if not train_config.enable_fsdp or local_rank==0:
			
 
				+                            print("max training steps reached, stopping training, total train steps finished: ", total_train_steps-1)
			
 
				+                        break
			
 
				+                    for key in batch.keys():
			
 
				+                        if train_config.enable_fsdp:
			
 
				+                            if is_xpu_available():
			
 
				+                                batch[key] = batch[key].to(torch.device(f"xpu:{local_rank}"))
			
 
				+                            else:
			
 
				+                                batch[key] = batch[key].to(local_rank)
			
 
				                         else:
			
 
				-                            batch[key] = batch[key].to(local_rank)
			
 
				-                    else:
			
 
				 
			
 
				-                        if is_xpu_available():
			
 
				-                            batch[key] = batch[key].to('xpu:0')
			
 
				-                        else:
			
 
				-                            batch[key] = batch[key].to('cuda:0')
			
 
				-                with autocast():
			
 
				-                    loss = model(**batch).loss
			
 
				-                loss = loss / gradient_accumulation_steps
			
 
				-                if train_config.save_metrics:
			
 
				-                    train_step_loss.append(loss.detach().float().item())
			
 
				-                    train_step_perplexity.append(float(torch.exp(loss.detach().float())))
			
 
				-                total_loss += loss.detach().float()
			
 
				-                if train_config.use_fp16:
			
 
				-                    # if fp16 is enabled, use gradient scaler to handle gradient update
			
 
				-                    scaler.scale(loss).backward()
			
 
				-                    if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
			
 
				-                        if train_config.gradient_clipping and train_config.gradient_clipping_threshold > 0.0:
			
 
				-                            scaler.unscale_(optimizer)
			
 
				-                            if train_config.enable_fsdp:
			
 
				-                                model.clip_grad_norm_(train_config.gradient_clipping_threshold)
			
 
				+                            if is_xpu_available():
			
 
				+                                batch[key] = batch[key].to('xpu:0')
			
 
				                             else:
			
 
				-                                torch.nn.utils.clip_grad_norm_(model.parameters(), train_config.gradient_clipping_threshold)
			
 
				-                        scaler.step(optimizer)
			
 
				-                        scaler.update()
			
 
				-                        optimizer.zero_grad()
			
 
				-                        pbar.update(1)
			
 
				-                else:
			
 
				-                    # regular backpropagation when fp16 is not used
			
 
				-                    loss.backward()
			
 
				-                    if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
			
 
				-                        if train_config.gradient_clipping and train_config.gradient_clipping_threshold > 0.0:
			
 
				-                            if train_config.enable_fsdp:
			
 
				-                                model.clip_grad_norm_(train_config.gradient_clipping_threshold)
			
 
				-                            else:
			
 
				-                                torch.nn.utils.clip_grad_norm_(model.parameters(), train_config.gradient_clipping_threshold)
			
 
				-                        optimizer.step()
			
 
				-                        optimizer.zero_grad()
			
 
				-                        pbar.update(1)
			
 
				-
			
 
				-                if wandb_run:
			
 
				-                    if not train_config.enable_fsdp or rank==0:
			
 
				-                        wandb_run.log({
			
 
				-                            'train/epoch': epoch + 1,
			
 
				-                            'train/step': epoch * len(train_dataloader) + step,
			
 
				-                            'train/loss': loss.detach().float(),
			
 
				-                        })
			
 
				-
			
 
				-                pbar.set_description(f"Training Epoch: {epoch+1}/{train_config.num_epochs}, step {step}/{len(train_dataloader)} completed (loss: {loss.detach().float()})")
			
 
				-
			
 
				-                if train_config.save_metrics:
			
 
				-                    save_to_json(metrics_filename, train_step_loss, train_loss, train_step_perplexity, train_prep, val_step_loss, val_loss, val_step_perplexity, val_prep)
			
 
				-            pbar.close()
			
 
				+                                batch[key] = batch[key].to('cuda:0')
			
 
				+                    with autocast():
			
 
				+                        loss = model(**batch).loss
			
 
				+                    loss = loss / gradient_accumulation_steps
			
 
				+                    if train_config.save_metrics:
			
 
				+                        train_step_loss.append(loss.detach().float().item())
			
 
				+                        train_step_perplexity.append(float(torch.exp(loss.detach().float())))
			
 
				+                    total_loss += loss.detach().float()
			
 
				+                    if train_config.use_fp16:
			
 
				+                        # if fp16 is enabled, use gradient scaler to handle gradient update
			
 
				+                        scaler.scale(loss).backward()
			
 
				+                        if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
			
 
				+                            if train_config.gradient_clipping and train_config.gradient_clipping_threshold > 0.0:
			
 
				+                                scaler.unscale_(optimizer)
			
 
				+                                if train_config.enable_fsdp:
			
 
				+                                    model.clip_grad_norm_(train_config.gradient_clipping_threshold)
			
 
				+                                else:
			
 
				+                                    torch.nn.utils.clip_grad_norm_(model.parameters(), train_config.gradient_clipping_threshold)
			
 
				+                            scaler.step(optimizer)
			
 
				+                            scaler.update()
			
 
				+                            optimizer.zero_grad()
			
 
				+                            pbar.update(1)
			
 
				+                    else:
			
 
				+                        # regular backpropagation when fp16 is not used
			
 
				+                        loss.backward()
			
 
				+                        if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
			
 
				+                            if train_config.gradient_clipping and train_config.gradient_clipping_threshold > 0.0:
			
 
				+                                if train_config.enable_fsdp:
			
 
				+                                    model.clip_grad_norm_(train_config.gradient_clipping_threshold)
			
 
				+                                else:
			
 
				+                                    torch.nn.utils.clip_grad_norm_(model.parameters(), train_config.gradient_clipping_threshold)
			
 
				+                            optimizer.step()
			
 
				+                            optimizer.zero_grad()
			
 
				+                            pbar.update(1)
			
 
				+                    if train_config.use_profiler or train_config.flop_counter:
			
 
				+                        profile_context.step()
			
 
				+                    if train_config.flop_counter and profile_context.is_done():
			
 
				+                        TFlops = profile_context.get_flops_per_sec() / 1e12
			
 
				+                    if wandb_run:
			
 
				+                        if not train_config.enable_fsdp or rank==0:
			
 
				+                            wandb_run.log({
			
 
				+                                'train/epoch': epoch + 1,
			
 
				+                                'train/step': epoch * len(train_dataloader) + step,
			
 
				+                                'train/loss': loss.detach().float(),
			
 
				+                            })
			
 
				+
			
 
				+                    pbar.set_description(f"Training Epoch: {epoch+1}/{train_config.num_epochs}, step {step}/{len(train_dataloader)} completed (loss: {loss.detach().float()})")
			
 
				+
			
 
				+                    if train_config.save_metrics:
			
 
				+                        save_to_json(metrics_filename, train_step_loss, train_loss, train_step_perplexity, train_prep, val_step_loss, val_loss, val_step_perplexity, val_prep)
			
 
				+                pbar.close()
			
 
				 
			
 
				         epoch_end_time = time.perf_counter()-epoch_start_time
			
 
				         epoch_times.append(epoch_end_time)
			
@@ -180,7 +219,6 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
 
				 
			
 
				         # Update the learning rate as needed
			
 
				         lr_scheduler.step()
			
 
				-
			
 
				         if train_config.run_validation:
			
 
				             eval_ppl, eval_epoch_loss, temp_val_loss, temp_step_perplexity = evaluation(model, train_config, eval_dataloader, local_rank, tokenizer, wandb_run)
			
 
				             if train_config.save_metrics:
			
@@ -266,7 +304,8 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
 
				     results["avg_checkpoint_time"] = avg_checkpoint_time
			
 
				     if train_config.save_metrics:
			
 
				         results["metrics_filename"] = metrics_filename
			
 
				-
			
 
				+    if train_config.flop_counter:
			
 
				+        results["model_tflops"]= TFlops
			
 
				     #saving the training params including fsdp setting for reference.
			
 
				     if train_config.enable_fsdp and not train_config.use_peft and rank==0:
			
 
				         save_train_params(train_config, fsdp_config, rank)
			
@@ -434,7 +473,7 @@ def get_policies(cfg, rank):
 
				     verify_bfloat_support = ((
			
 
				     torch.version.cuda
			
 
				     and torch.cuda.is_bf16_supported()
			
 
				-    and packaging.version.parse(torch.version.cuda).release >= (11, 0)
			
 
				+    and torch.version.cuda >= "11.0"
			
 
				     and dist.is_nccl_available()
			
 
				     and nccl.version() >= (2, 10)
			
 
				     ) or
			
--- a/src/tests/conftest.py
+++ b/src/tests/conftest.py
--- a/tests/datasets/test_custom_dataset.py
+++ b/tests/datasets/test_custom_dataset.py
@@ -33,6 +33,7 @@ def check_padded_entry(batch, tokenizer):
 
				     assert batch["input_ids"][0][-1] == tokenizer.eos_token_id
			
 
				 
			
 
				 
			
 
				+@pytest.mark.skip(reason="Flakey due to random dataset order @todo fix order")
			
 
				 @pytest.mark.skip_missing_tokenizer
			
 
				 @patch('llama_recipes.finetuning.train')
			
 
				 @patch('llama_recipes.finetuning.AutoTokenizer')
			
@@ -45,6 +46,7 @@ def test_custom_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker,
 
				     setup_tokenizer(tokenizer)
			
 
				 
			
 
				     skip_special_tokens = llama_version == "meta-llama/Llama-2-7b-hf"
			
 
				+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
			
 
				 
			
 
				     kwargs = {
			
 
				         "dataset": "custom_dataset",
			
@@ -98,10 +100,11 @@ def test_custom_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker,
 
				 @patch('llama_recipes.finetuning.AutoTokenizer.from_pretrained')
			
 
				 @patch('llama_recipes.finetuning.optim.AdamW')
			
 
				 @patch('llama_recipes.finetuning.StepLR')
			
 
				-def test_unknown_dataset_error(step_lr, optimizer, tokenizer, get_model, train, mocker):
			
 
				+def test_unknown_dataset_error(step_lr, optimizer, tokenizer, get_model, train, mocker, llama_version):
			
 
				     from llama_recipes.finetuning import main
			
 
				 
			
 
				     tokenizer.return_value = mocker.MagicMock(side_effect=lambda x: {"input_ids":[len(x)*[0,]], "attention_mask": [len(x)*[0,]]})
			
 
				+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
			
 
				 
			
 
				     kwargs = {
			
 
				         "dataset": "custom_dataset",
			
--- a/tests/datasets/test_grammar_datasets.py
+++ b/tests/datasets/test_grammar_datasets.py
@@ -26,6 +26,7 @@ def test_grammar_dataset(step_lr, optimizer, get_model, tokenizer, train, setup_
 
				     from llama_recipes.finetuning import main
			
 
				 
			
 
				     setup_tokenizer(tokenizer)
			
 
				+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
			
 
				 
			
 
				     BATCH_SIZE = 8
			
 
				     kwargs = {
			
--- a/tests/datasets/test_samsum_datasets.py
+++ b/tests/datasets/test_samsum_datasets.py
@@ -26,6 +26,7 @@ def test_samsum_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker,
 
				     from llama_recipes.finetuning import main
			
 
				 
			
 
				     setup_tokenizer(tokenizer)
			
 
				+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
			
 
				 
			
 
				     BATCH_SIZE = 8
			
 
				     kwargs = {
			
--- a/tests/test_batching.py
+++ b/tests/test_batching.py
@@ -25,7 +25,8 @@ def test_packing(step_lr, optimizer, get_model, tokenizer, train, setup_tokenize
 
				     from llama_recipes.finetuning import main
			
 
				 
			
 
				     setup_tokenizer(tokenizer)
			
 
				-
			
 
				+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
			
 
				+    
			
 
				     kwargs = {
			
 
				         "model_name": llama_version,
			
 
				         "batch_size_training": 8,
			
@@ -72,6 +73,7 @@ def test_distributed_packing(dist, is_initialized, fsdp, setup, step_lr, optimiz
 
				     from llama_recipes.finetuning import main
			
 
				 
			
 
				     setup_tokenizer(tokenizer)
			
 
				+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
			
 
				 
			
 
				     rank = 1
			
 
				     os.environ['LOCAL_RANK'] = f'{rank}'
			
--- a/tests/test_chat_completion.py
+++ b/tests/test_chat_completion.py
@@ -7,7 +7,7 @@ import pytest
 
				 import torch
			
 
				 from llama_recipes.inference.chat_utils import read_dialogs_from_file
			
 
				 
			
 
				-ROOT_DIR = Path(__file__).parents[1]
			
 
				+ROOT_DIR = Path(__file__).parents[2]
			
 
				 CHAT_COMPLETION_DIR = ROOT_DIR / "recipes/inference/local_inference/chat_completion/"
			
 
				 
			
 
				 sys.path = [CHAT_COMPLETION_DIR.as_posix()] + sys.path
			
@@ -107,6 +107,7 @@ def test_chat_completion(
 
				     from chat_completion import main
			
 
				 
			
 
				     setup_tokenizer(tokenizer)
			
 
				+    load_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
			
 
				 
			
 
				     kwargs = {
			
 
				         "prompt_file": (CHAT_COMPLETION_DIR / "chats.json").as_posix(),
			
--- a/src/tests/test_finetuning.py
+++ b/src/tests/test_finetuning.py
@@ -0,0 +1,264 @@
 
				+# Copyright (c) Meta Platforms, Inc. and affiliates.
			
 
				+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
			
 
				+
			
 
				+import os
			
 
				+from unittest.mock import patch
			
 
				+
			
 
				+import pytest
			
 
				+
			
 
				+import torch
			
 
				+from llama_recipes.data.sampler import LengthBasedBatchSampler
			
 
				+
			
 
				+from llama_recipes.finetuning import main
			
 
				+from pytest import approx
			
 
				+from torch.optim import AdamW
			
 
				+from torch.utils.data.dataloader import DataLoader
			
 
				+from torch.utils.data.sampler import BatchSampler
			
 
				+
			
 
				+
			
 
				+def get_fake_dataset():
			
 
				+    return [
			
 
				+        {
			
 
				+            "input_ids": [1],
			
 
				+            "attention_mask": [1],
			
 
				+            "labels": [1],
			
 
				+        }
			
 
				+    ]
			
 
				+
			
 
				+
			
 
				+@patch("llama_recipes.finetuning.torch.cuda.is_available")
			
 
				+@patch("llama_recipes.finetuning.train")
			
 
				+@patch("llama_recipes.finetuning.LlamaForCausalLM.from_pretrained")
			
 
				+@patch("llama_recipes.finetuning.AutoTokenizer.from_pretrained")
			
 
				+@patch("llama_recipes.finetuning.get_preprocessed_dataset")
			
 
				+@patch("llama_recipes.finetuning.optim.AdamW")
			
 
				+@patch("llama_recipes.finetuning.StepLR")
			
 
				+@pytest.mark.parametrize("cuda_is_available", [True, False])
			
 
				+def test_finetuning_no_validation(
			
 
				+    step_lr,
			
 
				+    optimizer,
			
 
				+    get_dataset,
			
 
				+    tokenizer,
			
 
				+    get_model,
			
 
				+    train,
			
 
				+    cuda,
			
 
				+    cuda_is_available,
			
 
				+):
			
 
				+    kwargs = {"run_validation": False}
			
 
				+
			
 
				+    get_dataset.return_value = get_fake_dataset()
			
 
				+    cuda.return_value = cuda_is_available
			
 
				+
			
 
				+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [0]
			
 
				+
			
 
				+    main(**kwargs)
			
 
				+
			
 
				+    assert train.call_count == 1
			
 
				+
			
 
				+    args, kwargs = train.call_args
			
 
				+    train_dataloader = args[1]
			
 
				+    eval_dataloader = args[2]
			
 
				+
			
 
				+    assert isinstance(train_dataloader, DataLoader)
			
 
				+    assert eval_dataloader is None
			
 
				+
			
 
				+    if cuda_is_available:
			
 
				+        assert get_model.return_value.to.call_count == 1
			
 
				+        assert get_model.return_value.to.call_args.args[0] == "cuda"
			
 
				+    else:
			
 
				+        assert get_model.return_value.to.call_count == 0
			
 
				+
			
 
				+
			
 
				+@patch("llama_recipes.finetuning.torch.cuda.is_available")
			
 
				+@patch("llama_recipes.finetuning.train")
			
 
				+@patch("llama_recipes.finetuning.LlamaForCausalLM.from_pretrained")
			
 
				+@patch("llama_recipes.finetuning.AutoTokenizer.from_pretrained")
			
 
				+@patch("llama_recipes.finetuning.get_preprocessed_dataset")
			
 
				+@patch("llama_recipes.finetuning.optim.AdamW")
			
 
				+@patch("llama_recipes.finetuning.StepLR")
			
 
				+@pytest.mark.parametrize("cuda_is_available", [True, False])
			
 
				+def test_finetuning_with_validation(
			
 
				+    step_lr,
			
 
				+    optimizer,
			
 
				+    get_dataset,
			
 
				+    tokenizer,
			
 
				+    get_model,
			
 
				+    train,
			
 
				+    cuda,
			
 
				+    cuda_is_available,
			
 
				+):
			
 
				+    kwargs = {"run_validation": True}
			
 
				+
			
 
				+    get_dataset.return_value = get_fake_dataset()
			
 
				+    cuda.return_value = cuda_is_available
			
 
				+
			
 
				+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [0]
			
 
				+
			
 
				+    main(**kwargs)
			
 
				+
			
 
				+    assert train.call_count == 1
			
 
				+
			
 
				+    args, kwargs = train.call_args
			
 
				+    train_dataloader = args[1]
			
 
				+    eval_dataloader = args[2]
			
 
				+    assert isinstance(train_dataloader, DataLoader)
			
 
				+    assert isinstance(eval_dataloader, DataLoader)
			
 
				+
			
 
				+    if cuda_is_available:
			
 
				+        assert get_model.return_value.to.call_count == 1
			
 
				+        assert get_model.return_value.to.call_args.args[0] == "cuda"
			
 
				+    else:
			
 
				+        assert get_model.return_value.to.call_count == 0
			
 
				+
			
 
				+
			
 
				+@patch("llama_recipes.finetuning.torch.cuda.is_available")
			
 
				+@patch("llama_recipes.finetuning.train")
			
 
				+@patch("llama_recipes.finetuning.LlamaForCausalLM.from_pretrained")
			
 
				+@patch("llama_recipes.finetuning.AutoTokenizer.from_pretrained")
			
 
				+@patch("llama_recipes.finetuning.get_preprocessed_dataset")
			
 
				+@patch("llama_recipes.finetuning.generate_peft_config")
			
 
				+@patch("llama_recipes.finetuning.get_peft_model")
			
 
				+@patch("llama_recipes.finetuning.optim.AdamW")
			
 
				+@patch("llama_recipes.finetuning.StepLR")
			
 
				+@pytest.mark.parametrize("cuda_is_available", [True, False])
			
 
				+def test_finetuning_peft_lora(
			
 
				+    step_lr,
			
 
				+    optimizer,
			
 
				+    get_peft_model,
			
 
				+    gen_peft_config,
			
 
				+    get_dataset,
			
 
				+    tokenizer,
			
 
				+    get_model,
			
 
				+    train,
			
 
				+    cuda,
			
 
				+    cuda_is_available,
			
 
				+):
			
 
				+    kwargs = {"use_peft": True}
			
 
				+
			
 
				+    get_dataset.return_value = get_fake_dataset()
			
 
				+    cuda.return_value = cuda_is_available
			
 
				+
			
 
				+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [0]
			
 
				+
			
 
				+    main(**kwargs)
			
 
				+
			
 
				+    if cuda_is_available:
			
 
				+        assert get_peft_model.return_value.to.call_count == 1
			
 
				+        assert get_peft_model.return_value.to.call_args.args[0] == "cuda"
			
 
				+    else:
			
 
				+        assert get_peft_model.return_value.to.call_count == 0
			
 
				+
			
 
				+    assert get_peft_model.return_value.print_trainable_parameters.call_count == 1
			
 
				+
			
 
				+
			
 
				+@patch("llama_recipes.finetuning.get_peft_model")
			
 
				+@patch("llama_recipes.finetuning.setup")
			
 
				+@patch("llama_recipes.finetuning.train")
			
 
				+@patch("llama_recipes.finetuning.LlamaForCausalLM.from_pretrained")
			
 
				+@patch("llama_recipes.finetuning.AutoTokenizer.from_pretrained")
			
 
				+@patch("llama_recipes.finetuning.get_preprocessed_dataset")
			
 
				+def test_finetuning_peft_llama_adapter(
			
 
				+    get_dataset, tokenizer, get_model, train, setup, get_peft_model
			
 
				+):
			
 
				+    kwargs = {
			
 
				+        "use_peft": True,
			
 
				+        "peft_method": "llama_adapter",
			
 
				+        "enable_fsdp": True,
			
 
				+    }
			
 
				+
			
 
				+    get_dataset.return_value = get_fake_dataset()
			
 
				+
			
 
				+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [0]
			
 
				+
			
 
				+    os.environ["RANK"] = "0"
			
 
				+    os.environ["LOCAL_RANK"] = "0"
			
 
				+    os.environ["WORLD_SIZE"] = "1"
			
 
				+    os.environ["MASTER_ADDR"] = "localhost"
			
 
				+    os.environ["MASTER_PORT"] = "12345"
			
 
				+
			
 
				+    with pytest.raises(
			
 
				+        RuntimeError,
			
 
				+        match="Llama_adapter is currently not supported in combination with FSDP",
			
 
				+    ):
			
 
				+        main(**kwargs)
			
 
				+
			
 
				+    GET_ME_OUT = "Get me out of here"
			
 
				+    get_peft_model.side_effect = RuntimeError(GET_ME_OUT)
			
 
				+
			
 
				+    kwargs["enable_fsdp"] = False
			
 
				+
			
 
				+    with pytest.raises(
			
 
				+        RuntimeError,
			
 
				+        match=GET_ME_OUT,
			
 
				+    ):
			
 
				+        main(**kwargs)
			
 
				+
			
 
				+
			
 
				+@patch("llama_recipes.finetuning.train")
			
 
				+@patch("llama_recipes.finetuning.LlamaForCausalLM.from_pretrained")
			
 
				+@patch("llama_recipes.finetuning.AutoTokenizer.from_pretrained")
			
 
				+@patch("llama_recipes.finetuning.get_preprocessed_dataset")
			
 
				+@patch("llama_recipes.finetuning.get_peft_model")
			
 
				+@patch("llama_recipes.finetuning.StepLR")
			
 
				+def test_finetuning_weight_decay(
			
 
				+    step_lr, get_peft_model, get_dataset, tokenizer, get_model, train
			
 
				+):
			
 
				+    kwargs = {"weight_decay": 0.01}
			
 
				+
			
 
				+    get_dataset.return_value = get_fake_dataset()
			
 
				+
			
 
				+    get_model.return_value.parameters.return_value = [torch.ones(1, 1)]
			
 
				+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [0]
			
 
				+
			
 
				+    main(**kwargs)
			
 
				+
			
 
				+    assert train.call_count == 1
			
 
				+
			
 
				+    args, kwargs = train.call_args
			
 
				+    optimizer = args[4]
			
 
				+
			
 
				+    print(optimizer.state_dict())
			
 
				+
			
 
				+    assert isinstance(optimizer, AdamW)
			
 
				+    assert optimizer.state_dict()["param_groups"][0]["weight_decay"] == approx(0.01)
			
 
				+
			
 
				+
			
 
				+@patch("llama_recipes.finetuning.train")
			
 
				+@patch("llama_recipes.finetuning.LlamaForCausalLM.from_pretrained")
			
 
				+@patch("llama_recipes.finetuning.AutoTokenizer.from_pretrained")
			
 
				+@patch("llama_recipes.finetuning.get_preprocessed_dataset")
			
 
				+@patch("llama_recipes.finetuning.optim.AdamW")
			
 
				+@patch("llama_recipes.finetuning.StepLR")
			
 
				+def test_batching_strategy(
			
 
				+    step_lr, optimizer, get_dataset, tokenizer, get_model, train
			
 
				+):
			
 
				+    kwargs = {"batching_strategy": "packing"}
			
 
				+
			
 
				+    get_dataset.return_value = get_fake_dataset()
			
 
				+
			
 
				+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [0]
			
 
				+
			
 
				+    main(**kwargs)
			
 
				+
			
 
				+    assert train.call_count == 1
			
 
				+
			
 
				+    args, kwargs = train.call_args
			
 
				+    train_dataloader, eval_dataloader = args[1:3]
			
 
				+    assert isinstance(train_dataloader.batch_sampler, BatchSampler)
			
 
				+    assert isinstance(eval_dataloader.batch_sampler, BatchSampler)
			
 
				+
			
 
				+    kwargs["batching_strategy"] = "padding"
			
 
				+    train.reset_mock()
			
 
				+    main(**kwargs)
			
 
				+
			
 
				+    assert train.call_count == 1
			
 
				+
			
 
				+    args, kwargs = train.call_args
			
 
				+    train_dataloader, eval_dataloader = args[1:3]
			
 
				+    assert isinstance(train_dataloader.batch_sampler, LengthBasedBatchSampler)
			
 
				+    assert isinstance(eval_dataloader.batch_sampler, LengthBasedBatchSampler)
			
 
				+
			
 
				+    kwargs["batching_strategy"] = "none"
			
 
				+
			
 
				+    with pytest.raises(ValueError):
			
 
				+        main(**kwargs)
			
--- a/src/tests/test_finetuning_data_formatter.py
+++ b/src/tests/test_finetuning_data_formatter.py
--- a/src/tests/test_sampler.py
+++ b/src/tests/test_sampler.py
--- a/tests/test_train_utils.py
+++ b/tests/test_train_utils.py
@@ -103,6 +103,7 @@ def test_save_to_json(temp_output_dir, mocker):
 
				     train_config.max_train_step = 0
			
 
				     train_config.max_eval_step = 0
			
 
				     train_config.output_dir = temp_output_dir
			
 
				+    train_config.use_profiler = False
			
 
				 
			
 
				     results = train(
			
 
				         model,
			
--- a/tests/test_finetuning.py
+++ b/tests/test_finetuning.py
@@ -1,176 +0,0 @@
 
				-# Copyright (c) Meta Platforms, Inc. and affiliates.
			
 
				-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
			
 
				-
			
 
				-import pytest
			
 
				-from pytest import approx
			
 
				-from unittest.mock import patch
			
 
				-
			
 
				-import torch
			
 
				-from torch.optim import AdamW
			
 
				-from torch.utils.data.dataloader import DataLoader
			
 
				-from torch.utils.data.sampler import BatchSampler
			
 
				-
			
 
				-from llama_recipes.finetuning import main
			
 
				-from llama_recipes.data.sampler import LengthBasedBatchSampler
			
 
				-
			
 
				-
			
 
				-def get_fake_dataset():
			
 
				-    return [{
			
 
				-        "input_ids":[1],
			
 
				-        "attention_mask":[1],
			
 
				-        "labels":[1],
			
 
				-        }]
			
 
				-
			
 
				-@patch('llama_recipes.finetuning.torch.cuda.is_available')
			
 
				-@patch('llama_recipes.finetuning.train')
			
 
				-@patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained')
			
 
				-@patch('llama_recipes.finetuning.AutoTokenizer.from_pretrained')
			
 
				-@patch('llama_recipes.finetuning.get_preprocessed_dataset')
			
 
				-@patch('llama_recipes.finetuning.optim.AdamW')
			
 
				-@patch('llama_recipes.finetuning.StepLR')
			
 
				-@pytest.mark.parametrize("cuda_is_available", [True, False])
			
 
				-def test_finetuning_no_validation(step_lr, optimizer, get_dataset, tokenizer, get_model, train, cuda, cuda_is_available):
			
 
				-    kwargs = {"run_validation": False}
			
 
				-
			
 
				-    get_dataset.return_value = get_fake_dataset()
			
 
				-    cuda.return_value = cuda_is_available
			
 
				-
			
 
				-    main(**kwargs)
			
 
				-
			
 
				-    assert train.call_count == 1
			
 
				-
			
 
				-    args, kwargs = train.call_args
			
 
				-    train_dataloader = args[1]
			
 
				-    eval_dataloader = args[2]
			
 
				-
			
 
				-    assert isinstance(train_dataloader, DataLoader)
			
 
				-    assert eval_dataloader is None
			
 
				-
			
 
				-    if cuda_is_available:
			
 
				-        assert get_model.return_value.to.call_count == 1
			
 
				-        assert get_model.return_value.to.call_args.args[0] == "cuda"
			
 
				-    else:
			
 
				-        assert get_model.return_value.to.call_count == 0
			
 
				-
			
 
				-
			
 
				-@patch('llama_recipes.finetuning.torch.cuda.is_available')
			
 
				-@patch('llama_recipes.finetuning.train')
			
 
				-@patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained')
			
 
				-@patch('llama_recipes.finetuning.AutoTokenizer.from_pretrained')
			
 
				-@patch('llama_recipes.finetuning.get_preprocessed_dataset')
			
 
				-@patch('llama_recipes.finetuning.optim.AdamW')
			
 
				-@patch('llama_recipes.finetuning.StepLR')
			
 
				-@pytest.mark.parametrize("cuda_is_available", [True, False])
			
 
				-def test_finetuning_with_validation(step_lr, optimizer, get_dataset, tokenizer, get_model, train, cuda, cuda_is_available):
			
 
				-    kwargs = {"run_validation": True}
			
 
				-
			
 
				-    get_dataset.return_value = get_fake_dataset()
			
 
				-    cuda.return_value = cuda_is_available
			
 
				-
			
 
				-    main(**kwargs)
			
 
				-
			
 
				-    assert train.call_count == 1
			
 
				-
			
 
				-    args, kwargs = train.call_args
			
 
				-    train_dataloader = args[1]
			
 
				-    eval_dataloader = args[2]
			
 
				-    assert isinstance(train_dataloader, DataLoader)
			
 
				-    assert isinstance(eval_dataloader, DataLoader)
			
 
				-
			
 
				-    if cuda_is_available:
			
 
				-        assert get_model.return_value.to.call_count == 1
			
 
				-        assert get_model.return_value.to.call_args.args[0] == "cuda"
			
 
				-    else:
			
 
				-        assert get_model.return_value.to.call_count == 0
			
 
				-
			
 
				-@patch('llama_recipes.finetuning.torch.cuda.is_available')
			
 
				-@patch('llama_recipes.finetuning.train')
			
 
				-@patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained')
			
 
				-@patch('llama_recipes.finetuning.AutoTokenizer.from_pretrained')
			
 
				-@patch('llama_recipes.finetuning.get_preprocessed_dataset')
			
 
				-@patch('llama_recipes.finetuning.generate_peft_config')
			
 
				-@patch('llama_recipes.finetuning.get_peft_model')
			
 
				-@patch('llama_recipes.finetuning.optim.AdamW')
			
 
				-@patch('llama_recipes.finetuning.StepLR')
			
 
				-@pytest.mark.parametrize("cuda_is_available", [True, False])
			
 
				-def test_finetuning_peft(step_lr, optimizer, get_peft_model, gen_peft_config, get_dataset, tokenizer, get_model, train, cuda, cuda_is_available):
			
 
				-    kwargs = {"use_peft": True}
			
 
				-
			
 
				-    get_dataset.return_value = get_fake_dataset()
			
 
				-    cuda.return_value = cuda_is_available
			
 
				-
			
 
				-    main(**kwargs)
			
 
				-
			
 
				-    if cuda_is_available:
			
 
				-        assert get_peft_model.return_value.to.call_count == 1
			
 
				-        assert get_peft_model.return_value.to.call_args.args[0] == "cuda"
			
 
				-    else:
			
 
				-        assert get_peft_model.return_value.to.call_count == 0
			
 
				-
			
 
				-    assert get_peft_model.return_value.print_trainable_parameters.call_count == 1
			
 
				-
			
 
				-
			
 
				-@patch('llama_recipes.finetuning.train')
			
 
				-@patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained')
			
 
				-@patch('llama_recipes.finetuning.AutoTokenizer.from_pretrained')
			
 
				-@patch('llama_recipes.finetuning.get_preprocessed_dataset')
			
 
				-@patch('llama_recipes.finetuning.get_peft_model')
			
 
				-@patch('llama_recipes.finetuning.StepLR')
			
 
				-def test_finetuning_weight_decay(step_lr, get_peft_model, get_dataset, tokenizer, get_model, train, mocker):
			
 
				-    kwargs = {"weight_decay": 0.01}
			
 
				-
			
 
				-    get_dataset.return_value = get_fake_dataset()
			
 
				-
			
 
				-    model = mocker.MagicMock(name="Model")
			
 
				-    model.parameters.return_value = [torch.ones(1,1)]
			
 
				-
			
 
				-    get_model.return_value = model
			
 
				-
			
 
				-    main(**kwargs)
			
 
				-
			
 
				-    assert train.call_count == 1
			
 
				-
			
 
				-    args, kwargs = train.call_args
			
 
				-    optimizer = args[4]
			
 
				-
			
 
				-    print(optimizer.state_dict())
			
 
				-
			
 
				-    assert isinstance(optimizer, AdamW)
			
 
				-    assert optimizer.state_dict()["param_groups"][0]["weight_decay"] == approx(0.01)
			
 
				-
			
 
				-
			
 
				-@patch('llama_recipes.finetuning.train')
			
 
				-@patch('llama_recipes.finetuning.LlamaForCausalLM.from_pretrained')
			
 
				-@patch('llama_recipes.finetuning.AutoTokenizer.from_pretrained')
			
 
				-@patch('llama_recipes.finetuning.get_preprocessed_dataset')
			
 
				-@patch('llama_recipes.finetuning.optim.AdamW')
			
 
				-@patch('llama_recipes.finetuning.StepLR')
			
 
				-def test_batching_strategy(step_lr, optimizer, get_dataset, tokenizer, get_model, train):
			
 
				-    kwargs = {"batching_strategy": "packing"}
			
 
				-
			
 
				-    get_dataset.return_value = get_fake_dataset()
			
 
				-
			
 
				-    main(**kwargs)
			
 
				-
			
 
				-    assert train.call_count == 1
			
 
				-
			
 
				-    args, kwargs = train.call_args
			
 
				-    train_dataloader, eval_dataloader = args[1:3]
			
 
				-    assert isinstance(train_dataloader.batch_sampler, BatchSampler)
			
 
				-    assert isinstance(eval_dataloader.batch_sampler, BatchSampler)
			
 
				-
			
 
				-    kwargs["batching_strategy"] = "padding"
			
 
				-    train.reset_mock()
			
 
				-    main(**kwargs)
			
 
				-
			
 
				-    assert train.call_count == 1
			
 
				-
			
 
				-    args, kwargs = train.call_args
			
 
				-    train_dataloader, eval_dataloader = args[1:3]
			
 
				-    assert isinstance(train_dataloader.batch_sampler, LengthBasedBatchSampler)
			
 
				-    assert isinstance(eval_dataloader.batch_sampler, LengthBasedBatchSampler)
			
 
				-
			
 
				-    kwargs["batching_strategy"] = "none"
			
 
				-
			
 
				-    with pytest.raises(ValueError):
			
 
				-        main(**kwargs)