Browse files

Merge branch 'main' into chatbot-e2e

Kai Wu 1 year ago
parent
commit
1623b7588b
63 changed files with 16,904 additions and 1,672 deletions
  1. +0 -0      .github/scripts/check_copyright_header.py
  2. +0 -0      .github/scripts/markdown_link_check_config.json
  3. +1 -1      scripts/spellcheck.sh
  4. +2 -2      scripts/spellcheck_conf/spellcheck.yaml
  5. +37 -0     scripts/spellcheck_conf/wordlist.txt
  6. +5 -5      .github/workflows/spellcheck.yml
  7. +4 -4      CONTRIBUTING.md
  8. +10 -9     docs/multi_gpu.md
  9. +8 -6      docs/single_gpu.md
  10. +1 -1     pyproject.toml
  11. +7 -17    recipes/README.md
  12. +12 -12   recipes/benchmarks/fmbench/README.md
  13. BIN       recipes/benchmarks/fmbench/img/business_summary.png
  14. +9 -10    recipes/benchmarks/inference_throughput/on-prem/README.md
  15. +2 -4     recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py
  16. +2 -3     recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json
  17. +2 -4     recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py
  18. +4 -2     recipes/finetuning/README.md
  19. +7 -7     recipes/finetuning/multigpu_finetuning.md
  20. +6 -6     recipes/finetuning/singlegpu_finetuning.md
  21. +0 -130   recipes/inference/llama_web_ui/Llama2_Gradio.ipynb
  22. +0 -25    recipes/inference/llama_web_ui/README.md
  23. +0 -3     recipes/inference/llama_web_ui/requirements.txt
  24. +0 -27    recipes/inference/llama_web_ui/streamlit_llama2.py
  25. +1 -1     recipes/inference/local_inference/README.md
  26. +147 -0   recipes/inference/mobile_inference/android_inference/README.md
  27. +14 -0    recipes/inference/mobile_inference/android_inference/mlc-package-config.json
  28. +14 -0    recipes/inference/mobile_inference/android_inference/requirements.txt
  29. +2 -4     recipes/inference/model_servers/README.md
  30. +1 -1     recipes/inference/model_servers/llama-on-prem.md
  31. +21 -99   recipes/llama_api_providers/Azure_API_example/azure_api_example.ipynb
  32. +89 -109  recipes/llama_api_providers/OctoAI_API_examples/Getting_to_know_Llama.ipynb
  33. +24 -34   recipes/llama_api_providers/OctoAI_API_examples/HelloLlamaCloud.ipynb
  34. +67 -143  recipes/llama_api_providers/OctoAI_API_examples/LiveData.ipynb
  35. +27 -31   recipes/llama_api_providers/OctoAI_API_examples/Llama2_Gradio.ipynb
  36. +23 -29   recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/RAG_Chatbot_Example.ipynb
  37. +3 -3     recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/requirements.txt
  38. +79 -126  recipes/llama_api_providers/OctoAI_API_examples/VideoSummary.ipynb
  39. +937 -0   recipes/llama_api_providers/llama3_cookbook_groq.ipynb
  40. +1 -1     recipes/quickstart/Getting_to_know_Llama.ipynb
  41. +1 -1     recipes/use_cases/LiveData.ipynb
  42. +698 -0   recipes/use_cases/MediaGen.ipynb
  43. +7 -1     recipes/use_cases/README.md
  44. +34 -25   recipes/use_cases/agents/langchain/README.md
  45. +0 -698   recipes/use_cases/agents/langchain/langgraph-agent.ipynb
  46. +931 -0   recipes/use_cases/agents/langchain/langgraph-custom-agent.ipynb
  47. +831 -0   recipes/use_cases/agents/langchain/langgraph-tool-calling-agent.ipynb
  48. +626 -82  recipes/use_cases/agents/langchain/tool-calling-agent.ipynb
  49. +10262 -0 recipes/use_cases/chatbots/sales_bot/Musical_instruments_reviews.csv
  50. +668 -0   recipes/use_cases/chatbots/sales_bot/SalesBot.ipynb
  51. +1261 -0  recipes/use_cases/llamaindex_cookbook.ipynb
  52. +3 -2     src/llama_recipes/inference/llm.py
  53. +1 -1     src/llama_recipes/utils/config_utils.py
  54. +0 -0     src/tests/conftest.py
  55. +4 -1     tests/datasets/test_custom_dataset.py
  56. +1 -0     tests/datasets/test_grammar_datasets.py
  57. +1 -0     tests/datasets/test_samsum_datasets.py
  58. +3 -1     tests/test_batching.py
  59. +2 -1     tests/test_chat_completion.py
  60. +0 -0     src/tests/test_finetuning.py
  61. +0 -0     src/tests/test_finetuning_data_formatter.py
  62. +0 -0     src/tests/test_sampler.py
  63. +1 -0     tests/test_train_utils.py


+ 1 - 1
scripts/spellcheck.sh

@@ -19,5 +19,5 @@ done
 if [ ! "$sources_arg" ]; then
 	echo "No files to spellcheck"
 else
-	pyspelling -c scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources_arg
+	pyspelling -c .github/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources_arg
 fi

+ 2 - 2
scripts/spellcheck_conf/spellcheck.yaml

@@ -5,8 +5,8 @@ matrix:
     d: en_US
   dictionary:
     wordlists:
-    - scripts/spellcheck_conf/wordlist.txt
-    output: scripts/spellcheck_conf/wordlist.dic
+    - .github/scripts/spellcheck_conf/wordlist.txt
+    output: .github/scripts/spellcheck_conf/wordlist.dic
     encoding: utf-8
   pipeline:
   - pyspelling.filters.context:

+ 37 - 0
scripts/spellcheck_conf/wordlist.txt

@@ -1314,3 +1314,40 @@ AgentExecutor
 LangGraph
 langgraph
 vectorstore
+CMake
+Chipset
+JBR
+JNI
+MLCChat
+MTP
+MacBook
+Moreau
+NDK
+NDK's
+OSX
+OnePlus
+OxygenOS
+SoC
+Sonoma
+TVM
+Thierry
+Wifi
+chipset
+feb
+moreau
+octo
+rustc
+rustup
+sha
+tmoreau
+toolchain
+wifi
+AgentFinish
+ReAct
+customizable
+Kaggle
+SalesBot
+Weaviate
+MediaGen
+SDXL
+SVD

+ 5 - 5
.github/workflows/spellcheck.yml

@@ -20,11 +20,11 @@ jobs:
         uses: gaurav-nelson/github-action-markdown-link-check@1.0.13
         with:
           use-verbose-mode: 'yes'
-          config-file: "scripts/markdown_link_check_config.json"
+          config-file: ".github/scripts/markdown_link_check_config.json"
 
       - name: Get changed files
         id: changed-files
-        uses: tj-actions/changed-files@v29.0.4
+        uses: tj-actions/changed-files@v41.0.0
         with:
 
           files: |
@@ -42,7 +42,7 @@ jobs:
 
       - name: Get changed files
         id: changed-files
-        uses: tj-actions/changed-files@v29.0.4
+        uses: tj-actions/changed-files@v41.0.0
         with:
           files: |
             **/*.md
@@ -56,11 +56,11 @@ jobs:
           if [ ! "$sources" ]; then
             echo "No files to spellcheck"
           else
-            pyspelling -c $GITHUB_WORKSPACE/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources
+            pyspelling -c $GITHUB_WORKSPACE/.github/scripts/spellcheck_conf/spellcheck.yaml --name Markdown $sources
           fi
 
       - name: In the case of misspellings
         if: ${{ failure() }}
         run: |
           echo "Please fix the misspellings. If you are sure about some of them, "
-          echo "so append those to scripts/spellcheck_conf/wordlist.txt"
+          echo "so append those to .github/scripts/spellcheck_conf/wordlist.txt"

+ 4 - 4
CONTRIBUTING.md

@@ -43,17 +43,17 @@ For development and contributing to llama-recipes please install from source wit
 pip install -U pip setuptools
 pip install --extra-index-url https://download.pytorch.org/whl/test/cu118 -e .[tests,auditnlg,vllm]
 ```
-The unit tests can be found in the [tests](./tests/) folder and you can run them from the main directory using:
+The unit tests can be found in the [src/tests](./src/tests/) folder and you can run them from the main directory using:
 ```
-python -m pytest tests/
+python -m pytest src/tests/
 ```
 To run all tests of a single file you can give the filename directly:
 ```
-python -m pytest tests/test_finetuning.py
+python -m pytest src/tests/test_finetuning.py
 ```
 To run a specific test you can filter for its name with
 ```
-python -m pytest tests/test_finetuning.py -k test_finetuning_peft
+python -m pytest src/tests/test_finetuning.py -k test_finetuning_peft
 ```
 To add a new test simply create a new test file under the tests folder (filename has to start with `test_`).
 Group tests spanning the same feature in the same file and create a subfolder if the tests are very extensive.

+ 10 - 9
docs/multi_gpu.md

@@ -24,7 +24,7 @@ This runs with the `samsum_dataset` for summarization application by default.
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -34,7 +34,7 @@ The args used in the command above are:
 
 * `--use_peft` boolean flag to enable PEFT methods in the script
 
-* `--peft_method` to specify the PEFT method, here we use `lora` other options are `llama_adapter`, `prefix`.
+* `--peft_method` to specify the PEFT method, here we use `lora` other options are `llama_adapter`.
 
 We use `torchrun` here to spawn multiple processes for FSDP.
 
@@ -43,7 +43,7 @@ We use `torchrun` here to spawn multiple processes for FSDP.
 Setting `use_fast_kernels` will enable using of Flash Attention or Xformer memory-efficient kernels based on the hardware being used. This would speed up the fine-tuning job. This has been enabled in `optimum` library from HuggingFace as a one-liner API, please read more [here](https://pytorch.org/blog/out-of-the-box-acceleration/).
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
 ```
 
 ### Fine-tuning using FSDP Only
@@ -52,7 +52,7 @@ If interested in running full parameter finetuning without making use of PEFT me
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 8  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 8  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
 
 ```
 
@@ -62,7 +62,7 @@ If you are interested in running full parameter fine-tuning on the 70B model, yo
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 8 examples/finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
+torchrun --nnodes 1 --nproc_per_node 8 examples/finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
 
 ```
 
@@ -95,16 +95,16 @@ To run with each of the datasets set the `dataset` flag in the command as shown
 
 ```bash
 # grammer_dataset
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -138,8 +138,9 @@ It lets us specify the training settings for everything from `model_name` to `da
     mixed_precision: bool=True
     val_batch_size: int=1
     dataset = "samsum_dataset"
-    peft_method: str = "lora" # None,llama_adapter, prefix
+    peft_method: str = "lora" # None, llama_adapter (Caution: llama_adapter is currently not supported with FSDP)
     use_peft: bool=False
+    from_peft_checkpoint: str="" # if not empty and use_peft=True, will load the peft checkpoint and resume the fine-tuning on that checkpoint
     output_dir: str = "PATH/to/save/PEFT/model"
     freeze_layers: bool = False
     num_freeze_layers: int = 1

+ 8 - 6
docs/single_gpu.md

@@ -20,14 +20,14 @@ Get access to a machine with one GPU or if using a multi-GPU machine please make
 
 ```bash
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization --use_fp16 --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization --use_fp16 --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
 The args used in the command above are:
 
 * `--use_peft` boolean flag to enable PEFT methods in the script
 
-* `--peft_method` to specify the PEFT method, here we use `lora` other options are `llama_adapter`, `prefix`.
+* `--peft_method` to specify the PEFT method, here we use `lora` other options are `llama_adapter`.
 
 * `--quantization` boolean flag to enable int8 quantization
 
@@ -51,16 +51,16 @@ to run with each of the datasets set the `dataset` flag in the command as shown
 ```bash
 # grammer_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -94,8 +94,9 @@ It let us specify the training settings, everything from `model_name` to `datase
     mixed_precision: bool=True
     val_batch_size: int=1
     dataset = "samsum_dataset"
-    peft_method: str = "lora" # None,llama_adapter, prefix
+    peft_method: str = "lora" # None, llama_adapter (Caution: llama_adapter is currently not supported with FSDP)
     use_peft: bool=False
+    from_peft_checkpoint: str="" # if not empty and use_peft=True, will load the peft checkpoint and resume the fine-tuning on that checkpoint
     output_dir: str = "PATH/to/save/PEFT/model"
     freeze_layers: bool = False
     num_freeze_layers: int = 1
@@ -112,6 +113,7 @@ It let us specify the training settings, everything from `model_name` to `datase
     flop_counter_start: int = 3 # The step to start profiling, default is 3, which means after 3 steps of warmup stage, the profiler will start to count flops.
     use_profiler: bool = False # Enable pytorch profiler, can not be used with flop counter at the same time.
     profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler
+
 ```
 
 * [Datasets config file](../src/llama_recipes/configs/datasets.py) provides the available options for datasets.

+ 1 - 1
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "llama-recipes"
-version = "0.0.1"
+version = "0.0.2"
 authors = [
   { name="Hamid Shojanazeri", email="hamidnazeri@meta.com" },
   { name="Matthias Reso", email="mreso@meta.com" },

File diffs are limited because too many files have changed
+ 7 - 17
recipes/README.md


File diffs are limited because too many files have changed
+ 12 - 12
recipes/benchmarks/fmbench/README.md


BIN
recipes/benchmarks/fmbench/img/business_summary.png


+ 9 - 10
recipes/benchmarks/inference_throughput/on-prem/README.md

@@ -1,26 +1,26 @@
 # Llama-On-Prem-Benchmark
-This folder contains code to run inference benchmark for Llama 2 models on-prem with popular serving frameworks.
-The benchmark will focus on overall inference **throughput** for running containers on one instance (single or multiple GPUs) that you can acquire from cloud service providers such as Azure and AWS. You can also run this benchmark on local laptop or desktop.  
+This folder contains code to run inference benchmark for Meta Llama 3 models on-prem with popular serving frameworks.
+The benchmark will focus on overall inference **throughput** for running containers on one instance (single or multiple GPUs) that you can acquire from cloud service providers such as Azure and AWS. You can also run this benchmark on local laptop or desktop.
 We support benchmark on these serving framework:
 * [vLLM](https://github.com/vllm-project/vllm)
 
 
 # vLLM - Getting Started
 
-To get started, we first need to deploy containers on-prem as a API host. Follow the guidance [here](../../../inference/model_servers/llama-on-prem.md#setting-up-vllm-with-llama-2) to deploy vLLM on-prem.
+To get started, we first need to deploy containers on-prem as a API host. Follow the guidance [here](../../../inference/model_servers/llama-on-prem.md#setting-up-vllm-with-llama-3) to deploy vLLM on-prem.
 
-Note that in common scenario which overall throughput is important, we suggest you prioritize deploying as many model replicas as possible to reach higher overall throughput and request-per-second (RPS), comparing to deploy one model container among multiple GPUs for model parallelism. Additionally, as deploying multiple model replicas, there is a need for a higher level wrapper to handle the load balancing which here has been simulated in the benchmark scripts.  
-For example, we have an instance from Azure that has 8xA100 80G GPUs, and we want to deploy the Llama 2 70B chat model, which is around 140GB with FP16. So for deployment we can do:
+Note that in common scenario which overall throughput is important, we suggest you prioritize deploying as many model replicas as possible to reach higher overall throughput and request-per-second (RPS), comparing to deploy one model container among multiple GPUs for model parallelism. Additionally, as deploying multiple model replicas, there is a need for a higher level wrapper to handle the load balancing which here has been simulated in the benchmark scripts.
+For example, we have an instance from Azure that has 8xA100 80G GPUs, and we want to deploy the Meta Llama 3 70B instruct model, which is around 140GB with FP16. So for deployment we can do:
 * 1x70B model parallel on 8 GPUs, each GPU RAM takes around 17.5GB for loading model weights.
 * 2x70B models each use 4 GPUs, each GPU RAM takes around 35GB for loading model weights.
 * 4x70B models each use 2 GPUs, each GPU RAM takes around 70GB for loading model weights. (Preferred configuration for max overall throughput. Note that you will have 4 endpoints hosted on different ports and the benchmark script will route requests into each model equally)
 
 Here are examples for deploying 2x70B chat models over 8 GPUs with vLLM.
 ```
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Llama-2-70b-chat-hf --tensor-parallel-size 4 --disable-log-requests --port 8000 
-CUDA_VISIBLE_DEVICES=4,5,6,7 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Llama-2-70b-chat-hf --tensor-parallel-size 4 --disable-log-requests --port 8001 
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8000
+CUDA_VISIBLE_DEVICES=4,5,6,7 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8001
 ```
-Once you have finished deployment, you can use the command below to run benchmark scripts in a separate terminal. 
+Once you have finished deployment, you can use the command below to run benchmark scripts in a separate terminal.
 
 ```
 python chat_vllm_benchmark.py
@@ -32,9 +32,8 @@ If you are going to use [Azure AI content check](https://azure.microsoft.com/en-
 pip install azure-ai-contentsafety azure-core
 ```
 Besides chat models, we also provide benchmark scripts for running pretrained models for text completion tasks. To better simulate the real traffic, we generate configurable random token prompt as input. In this process, we select vocabulary that is longer than 2 tokens so the generated words are closer to the English, rather than symbols.
-However, random token prompts can't be applied for chat model benchmarks, since the chat model expects a valid question. By feeding random prompts, chat models rarely provide answers that is meeting our ```MAX_NEW_TOKEN``` requirement, defeating the purpose of running throughput benchmarks. Hence for chat models, the questions are copied over to form long inputs such as for 2k and 4k inputs.   
+However, random token prompts can't be applied for chat model benchmarks, since the chat model expects a valid question. By feeding random prompts, chat models rarely provide answers that is meeting our ```MAX_NEW_TOKEN``` requirement, defeating the purpose of running throughput benchmarks. Hence for chat models, the questions are copied over to form long inputs such as for 2k and 4k inputs.
 To run pretrained model benchmark, follow the command below.
 ```
 python pretrained_vllm_benchmark.py
 ```
-
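
Before running the benchmark scripts against a deployment like the one above, each vLLM replica can be sanity-checked with a single OpenAI-compatible chat request. The sketch below is illustrative only; it assumes the first replica from the example is listening on port 8000 and serving `meta-llama/Meta-Llama-3-70B-Instruct`:

```python
# Minimal sanity check of one vLLM replica before launching chat_vllm_benchmark.py.
# Assumes the example deployment above: Meta-Llama-3-70B-Instruct on port 8000.
import requests

payload = {
    "model": "meta-llama/Meta-Llama-3-70B-Instruct",
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 64,
    "temperature": 0.6,
    "top_p": 0.9,
}
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    headers={"Content-Type": "application/json"},
    json=payload,
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```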

+ 2 - 4
recipes/benchmarks/inference_throughput/on-prem/vllm/chat_vllm_benchmark.py

@@ -40,8 +40,6 @@ MODEL_HEADERS = params["MODEL_HEADERS"]
 SAFE_CHECK = params["SAFE_CHECK"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"] 
-# Default Llama tokenizer, replace with your own tokenizer 
-TOKENIZER_PATH = params["TOKENIZER_PATH"] 
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
 # Add your model endpoints here, specify the port number. You can acquire the endpoint when creating a on-prem server like vLLM.
@@ -55,8 +53,8 @@ else:
     print("No available GPUs")
 
 
-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from HuggingFace based on the model path you set. Note Llama 3 use a different tokenizer compare to Llama 2
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
 
 num_token_input_prompt = len(tokenizer.encode(PROMPT))
 print(f"Number of token for input prompt: {num_token_input_prompt}")

+ 2 - 3
recipes/benchmarks/inference_throughput/on-prem/vllm/parameters.json

@@ -1,15 +1,14 @@
 {
     "MAX_NEW_TOKENS" : 256,
     "CONCURRENT_LEVELS" : [1, 2, 4, 8, 16, 32, 64, 128, 256],
-    "MODEL_PATH" : "meta-llama/Llama-2-7b-chat-hf",
+    "MODEL_PATH" : "meta-llama/Meta-Llama-3-70B-Instruct",
     "MODEL_HEADERS" : {"Content-Type": "application/json"},
     "SAFE_CHECK" : true,
     "THRESHOLD_TPS" : 7,
-    "TOKENIZER_PATH" : "../../tokenizer",
     "RANDOM_PROMPT_LENGTH" : 1000,
     "TEMPERATURE" : 0.6,
     "TOP_P" : 0.9,
     "MODEL_ENDPOINTS" : [
         "http://localhost:8000/v1/chat/completions"
     ]
-}
+}

+ 2 - 4
recipes/benchmarks/inference_throughput/on-prem/vllm/pretrained_vllm_benchmark.py

@@ -36,8 +36,6 @@ MODEL_HEADERS = params["MODEL_HEADERS"]
 SAFE_CHECK = params["SAFE_CHECK"]
 # Threshold for tokens per second below which we deem the query to be slow
 THRESHOLD_TPS = params["THRESHOLD_TPS"] 
-# Replace with your own tokenizer 
-TOKENIZER_PATH = params["TOKENIZER_PATH"] 
 RANDOM_PROMPT_LENGTH = params["RANDOM_PROMPT_LENGTH"]
 TEMPERATURE = params["TEMPERATURE"]
 TOP_P = params["TOP_P"]
@@ -52,8 +50,8 @@ else:
     print("No available GPUs")
 
 
-# This tokenizer is downloaded from Azure model catalog for each specific models. The main purpose is to decode the reponses for token calculation
-tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# This tokenizer is downloaded from HuggingFace based on the model path you set. Note Llama 3 use a different tokenizer compare to Llama 2
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
 
 # Select vocabulary that is longer than 2 tokens (closer to real words) and close to the English (not foolproof)
 vocab = [token for token in tokenizer.get_vocab().keys() if len(token) > 2 and all(ord(c) < 128 for c in token)]
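
The net effect of the tokenizer change in both benchmark scripts is that the tokenizer now follows `MODEL_PATH` from `parameters.json` instead of a separately configured `TOKENIZER_PATH`. A minimal sketch of the resulting setup, using the fields shown in this diff:

```python
# Sketch of how the updated scripts resolve the tokenizer from parameters.json.
import json

import transformers

with open("parameters.json") as f:
    params = json.load(f)

MODEL_PATH = params["MODEL_PATH"]  # e.g. "meta-llama/Meta-Llama-3-70B-Instruct"

# The tokenizer is downloaded from HuggingFace based on MODEL_PATH, so a Llama 3
# model automatically gets the Llama 3 tokenizer rather than a Llama 2 one.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
print(len(tokenizer.encode("How many tokens is this prompt?")))
```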

+ 4 - 2
recipes/finetuning/README.md

@@ -48,8 +48,9 @@ It lets us specify the training settings for everything from `model_name` to `da
     mixed_precision: bool=True
     val_batch_size: int=1
     dataset = "samsum_dataset"
-    peft_method: str = "lora" # None,llama_adapter, prefix
+    peft_method: str = "lora" # None, llama_adapter (Caution: llama_adapter is currently not supported with FSDP)
     use_peft: bool=False
+    from_peft_checkpoint: str="" # if not empty and use_peft=True, will load the peft checkpoint and resume the fine-tuning on that checkpoint
     output_dir: str = "PATH/to/save/PEFT/model"
     freeze_layers: bool = False
     num_freeze_layers: int = 1
@@ -66,6 +67,7 @@ It lets us specify the training settings for everything from `model_name` to `da
     flop_counter_start: int = 3 # The step to start profiling, default is 3, which means after 3 steps of warmup stage, the profiler will start to count flops.
     use_profiler: bool = False # Enable pytorch profiler, can not be used with flop counter at the same time.
     profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler
+
 ```
 
 * [Datasets config file](../../src/llama_recipes/configs/datasets.py) provides the available options for datasets.
@@ -99,7 +101,7 @@ It lets us specify the training settings for everything from `model_name` to `da
 You can enable [W&B](https://wandb.ai/) experiment tracking by using `use_wandb` flag as below. You can change the project name, entity and other `wandb.init` arguments in `wandb_config`.
 
 ```bash
-python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model --use_wandb
+python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model --use_wandb
 ```
 You'll be able to access a dedicated project or run link on [wandb.ai](https://wandb.ai) and see your dashboard like the one below.
 <div style="display: flex;">
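
The new `from_peft_checkpoint` field shown above can also be set programmatically. A hypothetical sketch, assuming the package's `llama_recipes.finetuning.main` entry point, which accepts config overrides as keyword arguments; the paths are placeholders:

```python
# Illustrative sketch: resume LoRA fine-tuning from a previously saved PEFT checkpoint.
# The entry point and all paths here are assumptions, not verified against this commit.
from llama_recipes.finetuning import main

main(
    model_name="/path_of_model_folder/8B",
    use_peft=True,
    peft_method="lora",
    from_peft_checkpoint="Path/to/save/PEFT/model",   # previously saved PEFT output
    output_dir="Path/to/save/PEFT/model-resumed",
    quantization=True,
)
```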

+ 7 - 7
recipes/finetuning/multigpu_finetuning.md

@@ -23,7 +23,7 @@ Get access to a machine with multiple GPUs (in this case we tested with 4 A100 a
 <details open>
 <summary>Single-node Multi-GPU</summary>
 
-    torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
+    torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
 
 </details>
 
@@ -49,7 +49,7 @@ The args used in the command above are:
 If interested in running full parameter finetuning without making use of PEFT methods, please use the following command. Make sure to change the `nproc_per_node` to your available GPUs. This has been tested with `BF16` on 8xA100, 40GB GPUs.
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
 ```
 
 ### Using less CPU memory (FSDP on 70B model)
@@ -57,7 +57,7 @@ torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name
 If you are running full parameter fine-tuning on the 70B model, you can enable `low_cpu_fsdp` mode as the following command. This option will load model on rank0 only before moving model to devices to construct FSDP. This can dramatically save cpu memory when loading large models like 70B (on a 8-gpu node, this reduces cpu memory from 2+T to 280G for 70B model). This has been tested with `BF16` on 16xA100, 80GB GPUs.
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 8 finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
+torchrun --nnodes 1 --nproc_per_node 8 finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
 ```
 
 
@@ -79,16 +79,16 @@ To run with each of the datasets set the `dataset` flag in the command as shown
 
 ```bash
 # grammer_dataset
-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /patht_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -103,7 +103,7 @@ This will require to set the Sharding strategy in [fsdp config](../../src/llama_
 
 ```bash
 
-torchrun --nnodes 4 --nproc_per_node 8 ./finetuning.py --enable_fsdp --low_cpu_fsdp --fsdp_config.pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --hsdp --sharding_group_size n --replica_group_size world_size/n
+torchrun --nnodes 4 --nproc_per_node 8 ./finetuning.py --enable_fsdp --low_cpu_fsdp --fsdp_config.pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --hsdp --sharding_group_size n --replica_group_size world_size/n
 
 ```
 

+ 6 - 6
recipes/finetuning/singlegpu_finetuning.md

@@ -16,7 +16,7 @@ To run fine-tuning on a single GPU, we will make use of two packages:
 ## How to run it?
 
 ```bash
-python -m finetuning.py  --use_peft --peft_method lora --quantization --use_fp16 --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python finetuning.py  --use_peft --peft_method lora --quantization --use_fp16 --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 ```
 The args used in the command above are:
 
@@ -34,7 +34,7 @@ Currently 3 open source datasets are supported that can be found in [Datasets co
 
 * `grammar_dataset` : use this [notebook](../../src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) to pull and process the Jfleg and C4 200M datasets for grammar checking.
 
-* `alpaca_dataset` : to get this open source data please download the `aplaca.json` to `dataset` folder.
+* `alpaca_dataset` : to get this open source data please download the `alpaca.json` to `dataset` folder.
 
 
 ```bash
@@ -46,18 +46,18 @@ wget -P ../../src/llama_recipes/datasets https://raw.githubusercontent.com/tatsu
 to run with each of the datasets set the `dataset` flag in the command as shown below:
 
 ```bash
-# grammer_dataset
+# grammar_dataset
 
-python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /patht_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
 

+ 0 - 130
recipes/inference/llama_web_ui/Llama2_Gradio.ipynb

@@ -1,130 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e4532411",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# TODO REFACTOR: Integrate code from _legacy/inference.py into this notebook"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "47a9adb3",
-   "metadata": {},
-   "source": [
-    "## This demo app shows how to query Llama 2 using the Gradio UI.\n",
-    "\n",
-    "Since we are using Replicate in this example, you will need to replace `<your replicate api token>` with your API token.\n",
-    "\n",
-    "To get the Replicate token: \n",
-    "\n",
-    "- You will need to first sign in with Replicate with your github account\n",
-    "- Then create a free API token [here](https://replicate.com/account/api-tokens) that you can use for a while \n",
-    "\n",
-    "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on Replicate.\n",
-    "\n",
-    "To run this example:\n",
-    "- Set up your Replicate API token and enter it in place of `<your replicate api token>`\n",
-    "- Run the notebook\n",
-    "- Enter your question and click Submit\n",
-    "\n",
-    "In the notebook or a browser with URL http://127.0.0.1:7860 you should see a UI with your answer."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "928041cc",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Init param `input` is deprecated, please use `model_kwargs` instead.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Running on local URL:  http://127.0.0.1:7860\n",
-      "\n",
-      "To create a public link, set `share=True` in `launch()`.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/plain": []
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from langchain.schema import AIMessage, HumanMessage\n",
-    "import gradio as gr\n",
-    "from langchain.llms import Replicate\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"REPLICATE_API_TOKEN\"] = \"<your replicate api token>\"\n",
-    "\n",
-    "llama2_13b_chat = \"meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d\"\n",
-    "\n",
-    "llm = Replicate(\n",
-    "    model=llama2_13b_chat,\n",
-    "    model_kwargs={\"temperature\": 0.01, \"top_p\": 1, \"max_new_tokens\":500}\n",
-    ")\n",
-    "\n",
-    "\n",
-    "def predict(message, history):\n",
-    "    history_langchain_format = []\n",
-    "    for human, ai in history:\n",
-    "        history_langchain_format.append(HumanMessage(content=human))\n",
-    "        history_langchain_format.append(AIMessage(content=ai))\n",
-    "    history_langchain_format.append(HumanMessage(content=message))\n",
-    "    gpt_response = llm(message) #history_langchain_format)\n",
-    "    return gpt_response#.content\n",
-    "\n",
-    "gr.ChatInterface(predict).launch()"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.18"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

+ 0 - 25
recipes/inference/llama_web_ui/README.md

@@ -1,25 +0,0 @@
-## Quick Web UI for Llama2 Chat
-If you prefer to see Llama2 in action in a web UI, instead of the notebooks above, you can try one of the two methods:
-
-### Running [Streamlit](https://streamlit.io/) with Llama2
-Open a Terminal, run the following commands:
-```
-pip install streamlit langchain replicate
-git clone https://github.com/facebookresearch/llama-recipes
-cd llama-recipes/llama-demo-apps
-```
-
-Replace the `<your replicate api token>` in `streamlit_llama2.py` with your API token created [here](https://replicate.com/account/api-tokens) - for more info, see the note [above](#replicate_note).
-
-Then run the command `streamlit run streamlit_llama2.py` and you'll see on your browser the following UI with question and answer - you can enter new text question, click Submit, and see Llama2's answer:
-
-![](../../../docs/images/llama2-streamlit.png)
-![](../../../docs/images/llama2-streamlit2.png)
-
-### Running [Gradio](https://www.gradio.app/) with Llama2 (using [Replicate](Llama2_Gradio.ipynb) or [OctoAI](../../llama_api_providers/OctoAI_API_examples/Llama2_Gradio.ipynb))
-
-To see how to query Llama2 and get answers with the Gradio UI both from the notebook and web, just launch the notebook `Llama2_Gradio.ipynb`. For more info, on how to get set up with a token to power these apps, see the note on [Replicate](../../README.md#replicate_note) and [OctoAI](../../README.md##octoai_note).
-
-Then enter your question, click Submit. You'll see in the notebook or a browser with URL http://127.0.0.1:7860 the following UI:
-
-![](../../../docs/images/llama2-gradio.png)

+ 0 - 3
recipes/inference/llama_web_ui/requirements.txt

@@ -1,3 +0,0 @@
-streamlit
-langchain
-replicate

+ 0 - 27
recipes/inference/llama_web_ui/streamlit_llama2.py

@@ -1,27 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
-
-# TODO REFACTOR: Convert this to an ipynb notebook
-
-import streamlit as st
-from langchain.llms import Replicate
-import os
-
-st.title("Llama2-powered Streamlit App")
-
-with st.sidebar:
-    os.environ["REPLICATE_API_TOKEN"] = "<your replicate api token>"
-
-def generate_response(input_text):
-    llama2_13b_chat = "meta/llama-2-13b-chat:f4e2de70d66816a838a89eeeb621910adffb0dd0baba3976c96980970978018d"
-
-    llm = Replicate(
-        model=llama2_13b_chat,
-        model_kwargs={"temperature": 0.01, "top_p": 1, "max_new_tokens":500}
-    )
-    st.info(llm(input_text))
-
-with st.form("my_form"):
-    text = st.text_area("Enter text:", "What is Generative AI?")
-    submitted = st.form_submit_button("Submit")
-    generate_response(text)

+ 1 - 1
recipes/inference/local_inference/README.md

@@ -69,7 +69,7 @@ In case you have fine-tuned your model with pure FSDP and saved the checkpoints
 This is helpful if you have fine-tuned you model using FSDP only as follows:
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 8  recipes/finetuning/finetuning.py --enable_fsdp --model_name /patht_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16
+torchrun --nnodes 1 --nproc_per_node 8  recipes/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16
 ```
 Then convert your FSDP checkpoint to HuggingFace checkpoints using:
 ```bash

+ 147 - 0
recipes/inference/mobile_inference/android_inference/README.md

@@ -0,0 +1,147 @@
+# Running Llama3 8B Instruct on Android with MLC-LLM
+
+Author: Thierry Moreau - tmoreau@octo.ai
+
+# Overview
+In this tutorial we'll learn how to deploy Llama3 8B Instruct on an Android-based phone using MLC-LLM.
+
+Machine Learning Compilation for Large Language Models (MLC LLM) is a high-performance universal deployment solution that allows native deployment of any large language models with native APIs with compiler acceleration. The mission of this project is to enable everyone to develop, optimize and deploy AI models natively on everyone's devices with ML compilation techniques.
+
+You can read more about MLC-LLM at the following [link](https://github.com/mlc-ai/mlc-llm).
+
+MLC-LLM is also what powers the Llama3 inference APIs provided by [OctoAI](https://octo.ai/). You can use OctoAI for your Llama3 cloud-based inference needs by trying out the examples under the [following path](../../../llama_api_providers/OctoAI_API_examples/).
+
+This tutorial was tested with the following setup:
+* MacBook Pro 16 inch from 2021 with Apple M1 Max and 32GB of RAM running Sonoma 14.3.1
+* OnePlus 12 Android Smartphone with a Snapdragon 8Gen3 SoC and 12GB or RAM, running OxygenOS 14.0
+
+Running Llama3 on a phone will likely require a powerful chipset. We haven't tested extensively the range of chipset that will support this usecase. Feel free to update this README.md to specify what devices were successfully tested.
+
+| Phone      | Chipset          | RAM  | Status  | Comments |
+|------------|------------------|------|---------|----------|
+| OnePlus 12 | Snapdragon 8Gen3 | 12GB | Success | None     |
+|            |                  |      |         |          |
+
+This guide is heavily based on the [MLC Android Guide](https://llm.mlc.ai/docs/deploy/android.html), but several steps have been taken to streamline the instructions.
+
+# Pre-requisites
+
+## Python
+
+Whether you're using conda or virtual env to manage your environment, we highly recommend starting from scratch with a clean new environment.
+
+For instance with virtual environment:
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+```
+
+Next you'll need to install the following packages:
+```bash
+python3 -m pip install -r requirements.txt
+```
+
+## Rust
+
+[Rust](https://www.rust-lang.org/tools/install) is needed to cross-compile HuggingFace tokenizers to Android.
+Make sure rustc, cargo, and rustup are available in $PATH.
+
+
+## Android Studio
+
+Install Android Studio from <!-- markdown-link-check-disable -->https://developer.android.com/studio<!-- markdown-link-check-enable --> with NDK and CMake.
+
+To install NDK and CMake, in the Android Studio welcome page, click “Projects → SDK Manager → SDK Tools”. Set up the following environment variables:
+
+* ANDROID_NDK so that $ANDROID_NDK/build/cmake/android.toolchain.cmake is available.
+* TVM_NDK_CC that points to NDK's clang compiler.
+
+For instance, the paths will look like the following on OSX for user `moreau`:
+```bash
+# Android + TVM setup
+export ANDROID_NDK="/Users/moreau/Library/Android/sdk/ndk/26.1.10909125"
+export TVM_NDK_CC="$ANDROID_NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android24-clang"
+```
+
+This tutorial was tested successfully on Android Studio Hedgehog | 2023.1.1 Patch 1.
+
+## JDK
+
+JDK, such as OpenJDK >= 17, to compile Java bindings of TVM Unity runtime.
+
+We strongly recommend setting the JAVA_HOME to the JDK bundled with Android Studio. Using Android Studio’s JBR bundle as recommended (<!-- markdown-link-check-disable -->https://developer.android.com/build/jdks<!-- markdown-link-check-enable -->) will reduce the chances of potential errors in JNI compilation.
+
+For instance on macOS, you'll need to point JAVA_HOME to the following.
+
+```bash
+export JAVA_HOME=/Applications/Android\ Studio.app/Contents/jbr/Contents/Home
+```
+
+To make sure the java binary can be found do an `ls $JAVA_HOME/bin/java`
+
+## MLC-LLM
+
+Let's clone mlc-llm from its repo in the directory of your choice:
+
+```bash
+cd /path/to/where/to/clone/repo
+git clone https://github.com/mlc-ai/mlc-llm --recursive
+export MLC_LLM_HOME=/path/to/mlc-llm
+```
+
+At the time of writing this README, we tested `mlc-llm` at the following sha: `21feb7010db02e0c2149489f5972d6a8a796b5a0`.
+
+## Phone Setup
+
+On your phone, enable debugging on your phone in your phone’s developer settings. Each phone manufacturer will have its own approach to enabling debug mode, so a simple Google search should equip you with the steps to do that on your phone.
+
+In addition, make sure to change your USB configuration from "Charging" to "MTP (Media Transfer Protocol)". This will allow us to connect to the device serially.
+
+Connect your phone to your development machine. On OSX, you'll be prompted on the dev machine whether you want to allow the accessory to connect. Hit "Allow".
+
+# Build Steps
+
+## Building the Android Package with MLC
+
+First edit the file under `android/MLCChat/mlc-package-config.json` and with the [mlc-package-config.json](./mlc-package-config.json) in llama-recipes.
+
+To understand what these JSON fields mean you can refer to this [documentation](https://llm.mlc.ai/docs/deploy/android.html#step-2-build-runtime-and-model-libraries).
+
+
+From the `mlc-llm` project root directory:
+
+```bash
+cd $MLC_LLM_HOME
+cd android/MLCChat
+python3 -m mlc_llm package  --package-config mlc-package-config.json --output dist
+```
+
+The command above will take a few minutes to run as it runs through the following steps:
+
+* Compile the Llama 3 8B instruct specified in the `mlc-package-config.json` into a binary model library.
+* Build the `mlc-llm` runtime and tokenizer. In addition to the model itself, a lightweight runtime and tokenizer are required to actually run the LLM.
+
+## Building and Running MLC Chat in Android Studio
+
+Now let's launch Android Studio.
+
+* On the "Welcome to Android Studio" page, hit "Open", and navigate to `$MLC_LLM_HOME/android/MLCChat`, then hit "Open"
+* A window will pop up asking whether to "Trust and Open project 'MLCChat'" - hit "Trust Project"
+* The project will now launch
+* Under File -> Project Structure... -> Project change the Gradle Version (second drop down from the top) to 8.5
+
+Connect your phone to your development machine - assuming you've followed the setup steps in the pre-requisite section, you should be able to see the device.
+
+Next you'll need to:
+
+* Hit Build -> Make Project.
+* Hit Run -> Run 'app'
+
+The MLCChat app will launch on your phone, now access your phone:
+
+* Under Model List you'll see the `Llama-3-8B-Instruct` LLM listed.
+* The model's not quite ready to launch yet, because the weights need to be downloaded over Wifi first. Hit the Download button on the right to the model name to download the weights from HuggingFace.
+
+Note that you can change the build settings to bundle the weights with the MLCChat app so you don't have to download the weights over wifi. To do so you can follow the instructions [here](https://llm.mlc.ai/docs/deploy/android.html#bundle-model-weights).
+
+Once the model weights are downloaded you can now interact with Llama 3 locally on your Android phone!

+ 14 - 0
recipes/inference/mobile_inference/android_inference/mlc-package-config.json

@@ -0,0 +1,14 @@
+{
+    "device": "android",
+    "model_list": [
+        {
+            "model": "HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC",
+            "estimated_vram_bytes": 4348727787,
+            "model_id": "Llama-3-8B-Instruct",
+            "overrides": {
+                "context_window_size": 768,
+                "prefill_chunk_size": 256
+            }
+        }
+    ]
+}
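
For a rough sense of what `estimated_vram_bytes` implies for the target phone, the config can be inspected with a few lines of Python (illustrative only):

```python
# Illustrative: read mlc-package-config.json and report the estimated VRAM per model.
import json

with open("mlc-package-config.json") as f:
    config = json.load(f)

for entry in config["model_list"]:
    gb = entry["estimated_vram_bytes"] / (1024 ** 3)
    print(f'{entry["model_id"]}: ~{gb:.1f} GB estimated VRAM '
          f'(context window {entry["overrides"]["context_window_size"]})')
```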

+ 14 - 0
recipes/inference/mobile_inference/android_inference/requirements.txt

@@ -0,0 +1,14 @@
+--pre
+--find-links https://mlc.ai/wheels
+mlc-llm-nightly
+mlc-ai-nightly
+attrs
+decorator
+numpy
+psutil
+pydantic
+requests
+scipy
+setuptools
+torch
+tqdm

+ 2 - 4
recipes/inference/model_servers/README.md

@@ -1,4 +1,2 @@
-## [Running Llama2 On-Prem with vLLM and TGI](llama-on-prem.md)
-This tutorial shows how to use Llama 2 with [vLLM](https://github.com/vllm-project/vllm) and Hugging Face [TGI](https://github.com/huggingface/text-generation-inference) to build Llama 2 on-prem apps.
-
-\* To run a quantized Llama2 model on iOS and Android, you can use  the open source [MLC LLM](https://github.com/mlc-ai/mlc-llm) or [llama.cpp](https://github.com/ggerganov/llama.cpp). You can even make a Linux OS that boots to Llama2 ([repo](https://github.com/trholding/llama2.c)).
+## [Running Llama 3 On-Prem with vLLM and TGI](llama-on-prem.md)
+This tutorial shows how to use Llama 3 with [vLLM](https://github.com/vllm-project/vllm) and Hugging Face [TGI](https://github.com/huggingface/text-generation-inference) to build Llama 3 on-prem apps.

File diffs are limited because too many files have changed
+ 1 - 1
recipes/inference/model_servers/llama-on-prem.md


+ 21 - 99
recipes/llama_api_providers/Azure_API_example/azure_api_example.ipynb

@@ -4,13 +4,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Use Azure API with Llama 2\n",
+    "# Use Azure API with Llama 3\n",
     "\n",
-    "This notebook shows examples of how to use Llama 2 APIs offered by Microsoft Azure. We will cover:  \n",
-    "* HTTP requests API usage for Llama 2 pretrained and chat models in CLI\n",
-    "* HTTP requests API usage for Llama 2 pretrained and chat models in Python\n",
+    "This notebook shows examples of how to use Llama 3 APIs offered by Microsoft Azure. We will cover:  \n",
+    "* HTTP requests API usage for Llama 3 instruct models in CLI\n",
+    "* HTTP requests API usage for Llama 3 instruct models in Python\n",
     "* Plug the APIs into LangChain\n",
     "* Wire the model with Gradio to build a simple chatbot with memory\n",
+    "\n",
     "\n"
    ]
   },
@@ -20,15 +21,13 @@
    "source": [
     "## Prerequisite\n",
     "\n",
-    "Before we start building with Azure Llama 2 APIs, there are certain steps we need to take to deploy the models:\n",
+    "Before we start building with Azure Llama 3 APIs, there are certain steps we need to take to deploy the models:\n",
     "\n",
     "* Register for a valid Azure account with subscription [here](https://azure.microsoft.com/en-us/free/search/?ef_id=_k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&OCID=AIDcmm5edswduu_SEM__k_CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE_k_&gad_source=1&gclid=CjwKCAiA-P-rBhBEEiwAQEXhH5OHAJLhzzcNsuxwpa5c9EJFcuAjeh6EvZw4afirjbWXXWkiZXmU2hoC5GoQAvD_BwE)\n",
     "* Take a quick look on what is the [Azure AI Studio](https://learn.microsoft.com/en-us/azure/ai-studio/what-is-ai-studio?tabs=home) and navigate to the website from the link in the article\n",
     "* Follow the demos in the article to create a project and [resource](https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/manage-resource-groups-portal) group, or you can also follow the guide [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio)\n",
-    "* Select Llama models from Model catalog\n",
-    "* Deploy with \"Pay-as-you-go\"\n",
-    "\n",
-    "Once deployed successfully, you should be assigned for an API endpoint and a security key for inference.  \n",
+    "* For Llama 3 instruct models from Model catalog, click Deploy in the model page and select \"Pay-as-you-go\". Once deployed successfully, you should be assigned for an API endpoint and a security key for inference.\n",
+    "* For Llama 3 pretrained models, Azure currently only support manual deployment under regular subscription. We are working with them to bring \"Pay-as-you-go\" for pretrained models.\n",
     "\n",
     "For more information, you should consult Azure's official documentation [here](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-llama?tabs=azure-studio) for model deployment and inference."
    ]
@@ -41,10 +40,12 @@
     "\n",
     "### Basics\n",
     "\n",
+    "The usage and schema of the API are identical to Llama 3 API hosted on Azure.\n",
+    "\n",
     "For using the REST API, You will need to have an Endpoint url and Authentication Key associated with that endpoint.  \n",
     "This can be acquired from previous steps.  \n",
     "\n",
-    "In this text completion example for pre-trained model, we use a simple curl call for illustration. There are three major components:  \n",
+    "In this chat completion example for instruct model, we use a simple curl call for illustration. There are three major components:  \n",
     "\n",
     "* The `host-url` is your endpoint url with completion schema. \n",
     "* The `headers` defines the content type as well as your api key. \n",
@@ -52,20 +53,9 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!curl -X POST -L https://your-endpoint.inference.ai.azure.com/v1/completions -H 'Content-Type: application/json' -H 'Authorization: your-auth-key' -d '{\"prompt\": \"Math is a\", \"max_tokens\": 30, \"temperature\": 0.7}' "
-   ]
-  },
-  {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "For chat completion, the API schema and request payload are slightly different.\n",
-    "\n",
     "The `host-url` needs to be `/v1/chat/completions` and the request payload to include roles in conversations. Here is a sample payload:  \n",
     "\n",
     "```\n",
@@ -100,18 +90,6 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "If you compare the generation result for both text and chat completion API calls, you will notice that:  \n",
-    "\n",
-    "* Text completion returns a list of `choices` for the input prompt, each contains generated text and completion information such as `logprobs`.\n",
-    "* Chat completion returns a list of `choices` each with a `message` object with completion result, matching the `messages` object in the request.  \n",
-    "\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
     "### Streaming\n",
     "\n",
     "One fantastic feature the API offers is the streaming capability.  \n",
@@ -147,7 +125,7 @@
    "source": [
     "### Content Safety Filtering\n",
     "\n",
-    "All Azure Llama 2 API endpoints have content safety feature turned on. Both input prompt and output tokens are filtered by this service automatically.  \n",
+    "All Azure Llama 3 API endpoints have content safety feature turned on. Both input prompt and output tokens are filtered by this service automatically.  \n",
     "To know more about the impact to the request/response payload, please refer to official guide [here](https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/content-filter?tabs=python).   \n",
     "\n",
     "For model input and output, if the filter detects there is harmful content, the generation will error out with a response payload containing the reasoning, along with information on the type of content violation and its severity. \n",
@@ -172,7 +150,7 @@
     "\n",
     "Besides calling the API directly from command line tools, you can also programatically call them in Python.  \n",
     "\n",
-    "Here is an example for the text completion model:\n",
+    "Here is an example for the instruct model:\n",
     "\n",
     "\n"
    ]
@@ -187,53 +165,6 @@
     "import json\n",
     "\n",
     "#Configure payload data sending to API endpoint\n",
-    "data = {\"prompt\": \"Math is a\", \n",
-    "         \"max_tokens\": 30, \n",
-    "         \"temperature\": 0.7,\n",
-    "         \"top_p\": 0.9,      \n",
-    "}\n",
-    "\n",
-    "body = str.encode(json.dumps(data))\n",
-    "\n",
-    "#Replace the url with your API endpoint\n",
-    "url = 'https://your-endpoint.inference.ai.azure.com/v1/completions'\n",
-    "\n",
-    "#Replace this with the key for the endpoint\n",
-    "api_key = 'your-auth-key'\n",
-    "if not api_key:\n",
-    "    raise Exception(\"API Key is missing\")\n",
-    "\n",
-    "headers = {'Content-Type':'application/json', 'Authorization':(api_key)}\n",
-    "req = urllib.request.Request(url, body, headers)\n",
-    "\n",
-    "try:\n",
-    "    response = urllib.request.urlopen(req)\n",
-    "    result = response.read()\n",
-    "    print(result)\n",
-    "except urllib.error.HTTPError as error:\n",
-    "    print(\"The request failed with status code: \" + str(error.code))\n",
-    "    # Print the headers - they include the requert ID and the timestamp, which are useful for debugging the failure\n",
-    "    print(error.info())\n",
-    "    print(error.read().decode(\"utf8\", 'ignore'))\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Chat completion in Python is very similar, here is a quick example:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import urllib.request\n",
-    "import json\n",
-    "\n",
-    "#Configure payload data sending to API endpoint\n",
     "data = {\"messages\":[\n",
     "            {\"role\":\"system\", \"content\":\"You are a helpful assistant.\"},\n",
     "            {\"role\":\"user\", \"content\":\"Who wrote the book Innovators dilemma?\"}], \n",
@@ -323,14 +254,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Use Llama 2 API with LangChain\n",
+    "## Use Llama 3 API with LangChain\n",
     "\n",
-    "In this section, we will demonstrate how to use Llama 2 APIs with LangChain, one of the most popular framework to accelerate building your AI product.  \n",
+    "In this section, we will demonstrate how to use Llama 3 APIs with LangChain, one of the most popular framework to accelerate building your AI product.  \n",
     "One common solution here is to create your customized LLM instance, so you can add it to various chains to complete different tasks.  \n",
     "In this example, we will use the `AzureMLOnlineEndpoint` class LangChain provides to build a customized LLM instance. This particular class is designed to take in Azure endpoint and API keys as inputs and wire it with HTTP calls. So the underlying of it is very similar to how we used `urllib.request` library to send RESTful calls in previous examples to the Azure Endpoint.   \n",
     "\n",
-    "Note Azure is working on a standard solution for LangChain integration in this [PR](https://github.com/langchain-ai/langchain/pull/14560), you should consider migrating to that in the future. \n",
-    "\n",
     "First, let's install dependencies: \n",
     "\n"
    ]
@@ -363,7 +292,7 @@
     "\n",
     "\n",
     "class AzureLlamaAPIContentFormatter(ContentFormatterBase):\n",
-    "#Content formatter for Llama 2 API for Azure MaaS\n",
+    "#Content formatter for Llama 3 API for Azure MaaS\n",
     "\n",
     "    def format_request_payload(self, prompt: str, model_kwargs: Dict) -> bytes:\n",
     "        #Formats the request according to the chosen api\n",
@@ -450,18 +379,11 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "At the time of writing this sample notebook, LangChain doesn't support streaming with `AzureMLOnlineEndpoint` for Llama 2. We are working with LangChain and Azure team to implement that."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Build a chatbot with Llama 2 API\n",
+    "## Build a chatbot with Llama 3 API\n",
     "\n",
-    "In this section, we will build a simple chatbot using Azure Llama 2 API, LangChain and [Gradio](https://www.gradio.app/)'s `ChatInterface` with memory capability.\n",
+    "In this section, we will build a simple chatbot using Azure Llama 3 API, LangChain and [Gradio](https://www.gradio.app/)'s `ChatInterface` with memory capability.\n",
     "\n",
-    "Gradio is a framework to help demo your machine learning model with a web interface. We also have a dedicated Gradio chatbot [example](https://github.com/meta-llama/llama-recipes/blob/main/recipes/use_cases/chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) built with Llama 2 on-premises with RAG.   \n",
+    "Gradio is a framework to help demo your machine learning model with a web interface. We also have a dedicated Gradio chatbot [example](https://github.com/meta-llama/llama-recipes/blob/main/recipes/use_cases/chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) built with Llama 3 on-premises with RAG.   \n",
     "\n",
     "First, let's install Gradio dependencies.\n"
    ]
@@ -508,7 +430,7 @@
     "langchain.debug=True\n",
     "\n",
     "class AzureLlamaAPIContentFormatter(ContentFormatterBase):\n",
-    "#Content formatter for Llama 2 API for Azure MaaS\n",
+    "#Content formatter for Llama 3 API for Azure MaaS\n",
     "\n",
     "    def format_request_payload(self, prompt: str, model_kwargs: Dict) -> bytes:\n",
     "        #Formats the request according to the chosen api\n",
@@ -602,7 +524,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.10"
+   "version": "3.9.6"
   }
  },
  "nbformat": 4,

+ 89 - 109
recipes/llama_api_providers/OctoAI_API_examples/Getting_to_know_Llama.ipynb

@@ -6,8 +6,43 @@
     "id": "LERqQn5v8-ak"
    },
    "source": [
-    "# **Getting to know Llama 2: Everything you need to start building**\n",
-    "Our goal in this session is to provide a guided tour of Llama 2, including understanding different Llama 2 models, how and where to access them, Generative AI and Chatbot architectures, prompt engineering, RAG (Retrieval Augmented Generation), Fine-tuning and more. All this is implemented with a starter code for you to take it and use it in your Llama 2 projects."
+    "# **Getting to know Llama 3: Everything you need to start building**\n",
+    "Our goal in this session is to provide a guided tour of Llama 3, including understanding different Llama 3 models, how and where to access them, Generative AI and Chatbot architectures, prompt engineering, RAG (Retrieval Augmented Generation), Fine-tuning and more. All this is implemented with a starter code for you to take it and use it in your Llama 3 projects."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "h3YGMDJidHtH"
+   },
+   "source": [
+    "### **Install dependencies**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "VhN6hXwx7FCp"
+   },
+   "outputs": [],
+   "source": [
+    "# Install dependencies and initialize\n",
+    "%pip install \\\n",
+    "    langchain==0.1.19 \\\n",
+    "    matplotlib \\\n",
+    "    octoai-sdk==0.10.1 \\\n",
+    "    openai \\\n",
+    "    sentence_transformers \\\n",
+    "    pdf2image \\\n",
+    "    pdfminer \\\n",
+    "    pdfminer.six \\\n",
+    "    unstructured \\\n",
+    "    faiss-cpu \\\n",
+    "    pillow-heif \\\n",
+    "    opencv-python \\\n",
+    "    unstructured-inference \\\n",
+    "    pikepdf"
    ]
   },
   {
@@ -58,7 +93,7 @@
     "    A[Users] --> B(Applications e.g. mobile, web)\n",
     "    B --> |Hosted API|C(Platforms e.g. Custom, OctoAI, HuggingFace, Replicate)\n",
     "    B -- optional --> E(Frameworks e.g. LangChain)\n",
-    "    C-->|User Input|D[Llama 2]\n",
+    "    C-->|User Input|D[Llama 3]\n",
     "    D-->|Model Output|C\n",
     "    E --> C\n",
     "    classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
@@ -69,19 +104,15 @@
     "  flowchart TD\n",
     "    A[User Prompts] --> B(Frameworks e.g. LangChain)\n",
     "    B <--> |Database, Docs, XLS|C[fa:fa-database External Data]\n",
-    "    B -->|API|D[Llama 2]\n",
+    "    B -->|API|D[Llama 3]\n",
     "    classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
     "  \"\"\")\n",
     "\n",
-    "def llama2_family():\n",
+    "def llama3_family():\n",
     "  mm(\"\"\"\n",
     "  graph LR;\n",
-    "      llama-2 --> llama-2-7b\n",
-    "      llama-2 --> llama-2-13b\n",
-    "      llama-2 --> llama-2-70b\n",
-    "      llama-2-7b --> llama-2-7b-chat\n",
-    "      llama-2-13b --> llama-2-13b-chat\n",
-    "      llama-2-70b --> llama-2-70b-chat\n",
+    "      llama-3 --> llama-3-8b-instruct\n",
+    "      llama-3 --> llama-3-70b-instruct\n",
     "      classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
     "  \"\"\")\n",
     "\n",
@@ -91,7 +122,7 @@
     "    users --> apps\n",
     "    apps --> frameworks\n",
     "    frameworks --> platforms\n",
-    "    platforms --> Llama 2\n",
+    "    platforms --> Llama 3\n",
     "    classDef default fill:#CCE6FF,stroke:#84BCF5,textColor:#1C2B33,fontFamily:trebuchet ms;\n",
     "  \"\"\")\n",
     "\n",
@@ -115,8 +146,8 @@
     "  user --> prompt\n",
     "  prompt --> i_safety\n",
     "  i_safety --> context\n",
-    "  context --> Llama_2\n",
-    "  Llama_2 --> output\n",
+    "  context --> Llama_3\n",
+    "  Llama_3 --> output\n",
     "  output --> o_safety\n",
     "  i_safety --> memory\n",
     "  o_safety --> memory\n",
@@ -165,7 +196,7 @@
     "id": "i4Np_l_KtIno"
    },
    "source": [
-    "##**1 - Understanding Llama 2**"
+    "##**1 - Understanding Llama 3**"
    ]
   },
   {
@@ -174,14 +205,13 @@
     "id": "PGPSI3M5PGTi"
    },
    "source": [
-    "### **1.1 - What is Llama 2?**\n",
+    "### **1.1 - What is Llama 3?**\n",
     "\n",
     "* State of the art (SOTA), Open Source LLM\n",
-    "* 7B, 13B, 70B\n",
+    "* Llama 3 8B, 70B\n",
     "* Pretrained + Chat\n",
     "* Choosing model: Size, Quality, Cost, Speed\n",
-    "* [Research paper](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)\n",
-    "\n",
+    "* [Llama 3 blog](https://ai.meta.com/blog/meta-llama-3/)\n",
     "* [Responsible use guide](https://ai.meta.com/llama/responsible-use-guide/)"
    ]
   },
@@ -208,7 +238,7 @@
    },
    "outputs": [],
    "source": [
-    "llama2_family()"
+    "llama3_family()"
    ]
   },
   {
@@ -217,11 +247,10 @@
     "id": "aYeHVVh45bdT"
    },
    "source": [
-    "###**1.2 - Accessing Llama 2**\n",
+    "###**1.2 - Accessing Llama 3**\n",
     "* Download + Self Host (on-premise)\n",
     "* Hosted API Platform (e.g. [OctoAI](https://octoai.cloud/), [Replicate](https://replicate.com/meta))\n",
-    "* Hosted Container Platform (e.g. [Azure](https://techcommunity.microsoft.com/t5/ai-machine-learning-blog/introducing-llama-2-on-azure/ba-p/3881233), [AWS](https://aws.amazon.com/blogs/machine-learning/llama-2-foundation-models-from-meta-are-now-available-in-amazon-sagemaker-jumpstart/), [GCP](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/139))\n",
-    "\n"
+    "* Hosted Container Platform (e.g. [Azure](https://techcommunity.microsoft.com/t5/ai-machine-learning-blog/introducing-llama-2-on-azure/ba-p/3881233), [AWS](https://aws.amazon.com/blogs/machine-learning/llama-2-foundation-models-from-meta-are-now-available-in-amazon-sagemaker-jumpstart/), [GCP](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/139))"
    ]
   },
   {
@@ -230,7 +259,7 @@
     "id": "kBuSay8vtzL4"
    },
    "source": [
-    "### **1.3 - Use Cases of Llama 2**\n",
+    "### **1.3 - Use Cases of Llama 3**\n",
     "* Content Generation\n",
     "* Chatbots\n",
     "* Summarization\n",
@@ -245,42 +274,9 @@
     "id": "sd54g0OHuqBY"
    },
    "source": [
-    "##**2 - Using Llama 2**\n",
+    "##**2 - Using Llama 3**\n",
     "\n",
-    "In this notebook, we are going to access [Llama 13b chat model](https://octoai.cloud/tools/text/chat?mode=demo&model=llama-2-13b-chat-fp16) using hosted API from OctoAI."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "h3YGMDJidHtH"
-   },
-   "source": [
-    "### **2.1 - Install dependencies**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "VhN6hXwx7FCp"
-   },
-   "outputs": [],
-   "source": [
-    "# Install dependencies and initialize\n",
-    "%pip install -qU \\\n",
-    "    octoai-sdk \\\n",
-    "    langchain \\\n",
-    "    sentence_transformers \\\n",
-    "    pdf2image \\\n",
-    "    pdfminer \\\n",
-    "    pdfminer.six \\\n",
-    "    unstructured \\\n",
-    "    faiss-cpu \\\n",
-    "    pillow-heif \\\n",
-    "    opencv-python \\\n",
-    "    unstructured-inference \\\n",
-    "    pikepdf"
+    "In this notebook, we are going to access [Llama 3 8b instruct model](https://octoai.cloud/text/chat?model=meta-llama-3-8b-instruct&mode=api) using hosted API from OctoAI."
    ]
   },
   {
@@ -292,9 +288,9 @@
    "outputs": [],
    "source": [
     "# model on OctoAI platform that we will use for inferencing\n",
-    "# We will use llama 13b chat model hosted on OctoAI server ()\n",
+    "# We will use llama 3 8b instruct model hosted on OctoAI server\n",
     "\n",
-    "llama2_13b = \"llama-2-13b-chat-fp16\""
+    "llama3_8b = \"meta-llama-3-8b-instruct\""
    ]
   },
   {
@@ -326,21 +322,21 @@
    },
    "outputs": [],
    "source": [
-    "# we will use OctoAI's hosted API\n",
-    "from octoai.client import Client\n",
+    "# We will use OpenAI's APIs to talk to OctoAI's hosted model endpoint\n",
+    "from openai import OpenAI\n",
     "\n",
-    "client = Client(OCTOAI_API_TOKEN)\n",
+    "client = OpenAI(\n",
+    "   base_url = \"https://text.octoai.run/v1\",\n",
+    "   api_key = os.environ[\"OCTOAI_API_TOKEN\"]\n",
+    ")\n",
     "\n",
     "# text completion with input prompt\n",
     "def Completion(prompt):\n",
     "    output = client.chat.completions.create(\n",
     "        messages=[\n",
-    "            {\n",
-    "                \"role\": \"user\",\n",
-    "                \"content\": prompt\n",
-    "            }\n",
+    "            {\"role\": \"user\", \"content\": prompt}\n",
     "        ],\n",
-    "        model=\"llama-2-13b-chat-fp16\",\n",
+    "        model=llama3_8b,\n",
     "        max_tokens=1000\n",
     "    )\n",
     "    return output.choices[0].message.content\n",
@@ -349,16 +345,10 @@
     "def ChatCompletion(prompt, system_prompt=None):\n",
     "    output = client.chat.completions.create(\n",
     "        messages=[\n",
-    "            {\n",
-    "                \"role\": \"system\",\n",
-    "                \"content\": system_prompt\n",
-    "            },\n",
-    "            {\n",
-    "                \"role\": \"user\",\n",
-    "                \"content\": prompt\n",
-    "            }\n",
+    "            {\"role\": \"system\", \"content\": system_prompt},\n",
+    "            {\"role\": \"user\", \"content\": prompt}\n",
     "        ],\n",
-    "        model=\"llama-2-13b-chat-fp16\",\n",
+    "        model=llama3_8b,\n",
     "        max_tokens=1000\n",
     "    )\n",
     "    return output.choices[0].message.content"
@@ -370,7 +360,7 @@
     "id": "5Jxq0pmf6L73"
    },
    "source": [
-    "### **2.2 - Basic completion**"
+    "# **2.1 - Basic completion**"
    ]
   },
   {
@@ -391,7 +381,7 @@
     "id": "StccjUDh6W0Q"
    },
    "source": [
-    "### **2.3 - System prompts**\n"
+    "## **2.2 - System prompts**\n"
    ]
   },
   {
@@ -415,7 +405,7 @@
     "id": "Hp4GNa066pYy"
    },
    "source": [
-    "### **2.4 - Response formats**\n",
+    "### **2.3 - Response formats**\n",
     "* Can support different formatted outputs e.g. text, JSON, etc."
    ]
   },
@@ -483,7 +473,7 @@
     "\n",
     "* User Prompts\n",
     "* Input Safety\n",
-    "* Llama 2\n",
+    "* Llama 3\n",
     "* Output Safety\n",
     "\n",
     "* Memory & Context"
@@ -743,12 +733,9 @@
     "### **4.3 - Retrieval Augmented Generation (RAG)**\n",
     "* Prompt Eng Limitations - Knowledge cutoff & lack of specialized data\n",
     "\n",
-    "* Retrieval Augmented Generation(RAG) allows us to retrieve snippets of information from external data sources and augment it to the user's prompt to get tailored responses from Llama 2.\n",
-    "\n",
-    "For our demo, we are going to download an external PDF file from a URL and query against the content in the pdf file to get contextually relevant information back with the help of Llama!\n",
+    "* Retrieval Augmented Generation(RAG) allows us to retrieve snippets of information from external data sources and augment it to the user's prompt to get tailored responses from Llama 3.\n",
     "\n",
-    "\n",
-    "\n"
+    "For our demo, we are going to download an external PDF file from a URL and query against the content in the pdf file to get contextually relevant information back with the help of Llama!"
    ]
   },
   {
@@ -797,24 +784,16 @@
    "source": [
     "# langchain setup\n",
     "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
-    "# Use the Llama 2 model hosted on OctoAI\n",
-    "# Temperature: Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic, 0.75 is a good starting value\n",
+    "\n",
+    "# Use the Llama 3 model hosted on OctoAI\n",
+    "# max_tokens: Maximum number of tokens to generate. A word is generally 2-3 tokens\n",
+    "# temperature: Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic, 0.75 is a good starting value\n",
     "# top_p: When decoding text, samples from the top p percentage of most likely tokens; lower to ignore less likely tokens\n",
-    "# max_new_tokens: Maximum number of tokens to generate. A word is generally 2-3 tokens\n",
     "llama_model = OctoAIEndpoint(\n",
-    "    endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n",
-    "    model_kwargs={\n",
-    "        \"model\": llama2_13b,\n",
-    "        \"messages\": [\n",
-    "            {\n",
-    "                \"role\": \"system\",\n",
-    "                \"content\": \"You are a helpful, respectful and honest assistant.\"\n",
-    "            }\n",
-    "        ],\n",
-    "        \"max_tokens\": 1000,\n",
-    "        \"top_p\": 1,\n",
-    "        \"temperature\": 0.75\n",
-    "    },\n",
+    "    model=llama3_8b,\n",
+    "    max_tokens=1000,\n",
+    "    temperature=0.75,\n",
+    "    top_p=1\n",
     ")"
    ]
   },
@@ -973,10 +952,11 @@
    },
    "source": [
     "#### **Resources**\n",
-    "- [GitHub - Llama 2](https://github.com/facebookresearch/llama)\n",
-    "- [Github - LLama 2 Recipes](https://github.com/facebookresearch/llama-recipes)\n",
-    "- [Llama 2](https://ai.meta.com/llama/)\n",
-    "- [Research Paper](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)\n",
+    "- [GitHub - Llama](https://github.com/facebookresearch/llama)\n",
+    "- [Github - LLama Recipes](https://github.com/facebookresearch/llama-recipes)\n",
+    "- [Llama](https://ai.meta.com/llama/)\n",
+    "- [Research Paper on Llama 2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)\n",
+    "- [Llama 3 Page](https://ai.meta.com/blog/meta-llama-3/)\n",
     "- [Model Card](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md)\n",
     "- [Responsible Use Guide](https://ai.meta.com/llama/responsible-use-guide/)\n",
     "- [Acceptable Use Policy](https://ai.meta.com/llama/use-policy/)\n",
@@ -992,9 +972,9 @@
    "source": [
     "#### **Authors & Contact**\n",
     "  * asangani@meta.com, [Amit Sangani | LinkedIn](https://www.linkedin.com/in/amitsangani/)\n",
-    "  * mohsena@meta.com, [Mohsen Agsen | LinkedIn](https://www.linkedin.com/in/mohsen-agsen-62a9791/)\n",
+    "  * mohsena@meta.com, [Mohsen Agsen | LinkedIn](https://www.linkedin.com/in/dr-thierry-moreau/)\n",
     "\n",
-    "Adapted to run on OctoAI by Thierry Moreau - tmoreau@octo.ai"
+    "Adapted to run on OctoAI and use Llama 3 by tmoreau@octo.ai [Thierry Moreay | LinkedIn]()"
    ]
   }
  ],
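
Taken together, the hunks above replace the OctoAI SDK client with the OpenAI-compatible client. A condensed, self-contained sketch of that setup follows; it assumes `OCTOAI_API_TOKEN` is already set in the environment (the notebook collects it earlier with `getpass`), and the prompt string is only an illustration.

```python
# Sketch of the OpenAI-compatible client wiring used by the updated notebook.
# Assumes the OCTOAI_API_TOKEN environment variable is already set.
import os
from openai import OpenAI

llama3_8b = "meta-llama-3-8b-instruct"

client = OpenAI(
    base_url="https://text.octoai.run/v1",
    api_key=os.environ["OCTOAI_API_TOKEN"],
)

def chat_completion(prompt: str, system_prompt: str = "You are a helpful assistant.") -> str:
    # Mirrors the notebook's ChatCompletion helper: system + user message, capped at 1000 tokens
    output = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        model=llama3_8b,
        max_tokens=1000,
    )
    return output.choices[0].message.content

print(chat_completion("Explain retrieval augmented generation in one sentence."))
```

Because the endpoint is OpenAI-compatible, the same client pattern reappears in the Gradio chatbot diff further below with only the model name changed.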

+ 24 - 34
recipes/llama_api_providers/OctoAI_API_examples/HelloLlamaCloud.ipynb

@@ -6,13 +6,12 @@
    "metadata": {},
    "source": [
     "## This demo app shows:\n",
-    "* How to run Llama2 in the cloud hosted on OctoAI\n",
+    "* How to run Llama 3 in the cloud hosted on OctoAI\n",
     "* How to use LangChain to ask Llama general questions and follow up questions\n",
-    "* How to use LangChain to load a recent PDF doc - the Llama2 paper pdf - and chat about it. This is the well known RAG (Retrieval Augmented Generation) method to let LLM such as Llama2 be able to answer questions about the data not publicly available when Llama2 was trained, or about your own data. RAG is one way to prevent LLM's hallucination\n",
-    "* You should also review the [HelloLlamaLocal](HelloLlamaLocal.ipynb) notebook for more information on RAG\n",
+    "* How to use LangChain to load a recent PDF doc - the Llama paper pdf - and chat about it. This is the well known RAG (Retrieval Augmented Generation) method to let LLM such as Llama be able to answer questions about your own data. RAG is one way to prevent LLM's hallucination\n",
     "\n",
     "**Note** We will be using OctoAI to run the examples here. You will need to first sign into [OctoAI](https://octoai.cloud/) with your Github or Google account, then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first).\n",
-    "After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI."
+    "After the free trial ends, you will need to enter billing info to continue to use Llama 3 hosted on OctoAI."
    ]
   },
   {
@@ -35,7 +34,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install langchain octoai-sdk sentence-transformers chromadb pypdf"
+    "%pip install langchain==0.1.19 octoai-sdk==0.10.1 openai sentence-transformers chromadb pypdf"
    ]
   },
   {
@@ -57,15 +56,17 @@
    "id": "3e8870c1",
    "metadata": {},
    "source": [
-    "Next we call the Llama 2 model from OctoAI. In this example we will use the Llama 2 13b chat FP16 model. You can find more on Llama 2 models on the [OctoAI text generation solution page](https://octoai.cloud/tools/text).\n",
+    "Next we call the Llama 3 model from OctoAI. In this example we will use the Llama 3 8b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
     "\n",
     "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
-    "* llama-2-13b-chat\n",
-    "* llama-2-70b-chat\n",
+    "* meta-llama-3-8b-instruct\n",
+    "* meta-llama-3-70b-instruct\n",
     "* codellama-7b-instruct\n",
     "* codellama-13b-instruct\n",
     "* codellama-34b-instruct\n",
-    "* codellama-70b-instruct"
+    "* llama-2-13b-chat\n",
+    "* llama-2-70b-chat\n",
+    "* llamaguard-7b"
    ]
   },
   {
@@ -77,21 +78,11 @@
    "source": [
     "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
     "\n",
-    "llama2_13b = \"llama-2-13b-chat-fp16\"\n",
+    "llama3_8b = \"meta-llama-3-8b-instruct\"\n",
     "llm = OctoAIEndpoint(\n",
-    "    endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n",
-    "    model_kwargs={\n",
-    "        \"model\": llama2_13b,\n",
-    "        \"messages\": [\n",
-    "            {\n",
-    "                \"role\": \"system\",\n",
-    "                \"content\": \"You are a helpful, respectful and honest assistant.\"\n",
-    "            }\n",
-    "        ],\n",
-    "        \"max_tokens\": 500,\n",
-    "        \"top_p\": 1,\n",
-    "        \"temperature\": 0.01\n",
-    "    },\n",
+    "    model=llama3_8b,\n",
+    "    max_tokens=500,\n",
+    "    temperature=0.01\n",
     ")"
    ]
   },
@@ -111,7 +102,7 @@
    "outputs": [],
    "source": [
     "question = \"who wrote the book Innovator's dilemma?\"\n",
-    "answer = llm(question)\n",
+    "answer = llm.invoke(question)\n",
     "print(answer)"
    ]
   },
@@ -134,7 +125,7 @@
    "source": [
     "# chat history not passed so Llama doesn't have the context and doesn't know this is more about the book\n",
     "followup = \"tell me more\"\n",
-    "followup_answer = llm(followup)\n",
+    "followup_answer = llm.invoke(followup)\n",
     "print(followup_answer)"
    ]
   },
@@ -162,7 +153,7 @@
     "memory = ConversationBufferMemory()\n",
     "conversation = ConversationChain(\n",
     "    llm=llm, \n",
-    "    memory = memory,\n",
+    "    memory=memory,\n",
     "    verbose=False\n",
     ")"
    ]
@@ -208,11 +199,10 @@
    "id": "fc436163",
    "metadata": {},
    "source": [
-    "Next, let's explore using Llama 2 to answer questions using documents for context. \n",
-    "This gives us the ability to update Llama 2's knowledge thus giving it better context without needing to finetune. \n",
-    "For a more in-depth study of this, see the notebook on using Llama 2 locally [here](HelloLlamaLocal.ipynb)\n",
+    "Next, let's explore using Llama 3 to answer questions using documents for context. \n",
+    "This gives us the ability to update Llama 3's knowledge thus giving it better context without needing to finetune. \n",
     "\n",
-    "We will use the PyPDFLoader to load in a pdf, in this case, the Llama 2 paper."
+    "We will use the PyPDFLoader to load in a pdf, in this case, the Llama paper."
    ]
   },
   {
@@ -301,7 +291,7 @@
    "id": "54ad02d7",
    "metadata": {},
    "source": [
-    "We then use ` RetrievalQA` to retrieve the documents from the vector database and give the model more context on Llama 2, thereby increasing its knowledge.\n",
+    "We then use ` RetrievalQA` to retrieve the documents from the vector database and give the model more context on Llama, thereby increasing its knowledge.\n",
     "\n",
     "For each question, LangChain performs a semantic similarity search of it in the vector db, then passes the search results as the context to Llama to answer the question."
    ]
@@ -321,7 +311,7 @@
     "    retriever=vectordb.as_retriever()\n",
     ")\n",
     "\n",
-    "question = \"What is llama2?\"\n",
+    "question = \"What is llama?\"\n",
     "result = qa_chain({\"query\": question})\n",
     "print(result['result'])"
    ]
@@ -344,7 +334,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# no context passed so Llama2 doesn't have enough context to answer so it lets its imagination go wild\n",
+    "# no context passed so Llama doesn't have enough context to answer so it lets its imagination go wild\n",
     "result = qa_chain({\"query\": \"what are its use cases?\"})\n",
     "print(result['result'])"
    ]
@@ -376,7 +366,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# let's ask the original question \"What is llama2?\" again\n",
+    "# let's ask the original question \"What is llama?\" again\n",
     "result = chat_chain({\"question\": question, \"chat_history\": []})\n",
     "print(result['answer'])"
    ]
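
The LangChain pieces in the hunks above can be read as one compact flow; a sketch is below. It assumes `langchain==0.1.19` from the install cell and an `OCTOAI_API_TOKEN` environment variable, as the notebook sets up earlier; the questions are the ones used in the diff.

```python
# Compact sketch of the updated LangChain flow: a direct question, then a conversation with memory.
# Assumes langchain==0.1.19 (from the install cell) and OCTOAI_API_TOKEN set in the environment.
from langchain.chains import ConversationChain
from langchain.llms.octoai_endpoint import OctoAIEndpoint
from langchain.memory import ConversationBufferMemory

llm = OctoAIEndpoint(
    model="meta-llama-3-8b-instruct",
    max_tokens=500,
    temperature=0.01,
)

# Single-turn question: no chat history, so a bare follow-up would lack context
print(llm.invoke("who wrote the book Innovator's dilemma?"))

# Multi-turn conversation: ConversationBufferMemory keeps the history between calls
conversation = ConversationChain(llm=llm, memory=ConversationBufferMemory(), verbose=False)
print(conversation.predict(input="who wrote the book Innovator's dilemma?"))
print(conversation.predict(input="tell me more"))
```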

+ 67 - 143
recipes/llama_api_providers/OctoAI_API_examples/LiveData.ipynb

@@ -7,12 +7,12 @@
    "source": [
     "## This demo app shows:\n",
     "* How to use LlamaIndex, an open source library to help you build custom data augmented LLM applications\n",
-    "* How to ask Llama questions about recent live data via the You.com live search API and LlamaIndex\n",
+    "* How to ask Llama 3 questions about recent live data via the Tavily live search API\n",
     "\n",
-    "The LangChain package is used to facilitate the call to Llama2 hosted on OctoAI\n",
+    "The LangChain package is used to facilitate the call to Llama 3 hosted on OctoAI\n",
     "\n",
     "**Note** We will be using OctoAI to run the examples here. You will need to first sign into [OctoAI](https://octoai.cloud/) with your Github or Google account, then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first).\n",
-    "After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI."
+    "After the free trial ends, you will need to enter billing info to continue to use Llama3 hosted on OctoAI."
    ]
   },
   {
@@ -32,23 +32,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install llama-index langchain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "21fe3849",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# use ServiceContext to configure the LLM used and the custom embeddings\n",
-    "from llama_index import ServiceContext\n",
-    "\n",
-    "# VectorStoreIndex is used to index custom data \n",
-    "from llama_index import VectorStoreIndex\n",
-    "\n",
-    "from langchain.llms.octoai_endpoint import OctoAIEndpoint"
+    "!pip install llama-index \n",
+    "!pip install llama-index-core\n",
+    "!pip install llama-index-llms-octoai\n",
+    "!pip install llama-index-embeddings-octoai\n",
+    "!pip install octoai-sdk\n",
+    "!pip install tavily-python\n",
+    "!pip install replicate"
    ]
   },
   {
@@ -75,227 +65,161 @@
   },
   {
    "cell_type": "markdown",
-   "id": "f8ff812b",
-   "metadata": {},
-   "source": [
-    "In this example we will use the [YOU.com](https://you.com/) search engine to augment the LLM's responses.\n",
-    "To use the You.com Search API, you can email api@you.com to request an API key. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "75275628-5235-4b55-8033-601c76107528",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "YOUCOM_API_KEY = getpass()\n",
-    "os.environ[\"YOUCOM_API_KEY\"] = YOUCOM_API_KEY"
-   ]
-  },
-  {
-   "cell_type": "markdown",
    "id": "cb210c7c",
    "metadata": {},
    "source": [
-    "We then call the Llama 2 model from OctoAI.\n",
+    "We then call the Llama 3 model from OctoAI.\n",
     "\n",
-    "We will use the Llama 2 13b chat FP16 model. You can find more on Llama 2 models on the [OctoAI text generation solution page](https://octoai.cloud/tools/text).\n",
+    "We will use the Llama 3 8b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
     "\n",
     "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
-    "* llama-2-13b-chat\n",
-    "* llama-2-70b-chat\n",
+    "* meta-llama-3-8b-instruct\n",
+    "* meta-llama-3-70b-instruct\n",
     "* codellama-7b-instruct\n",
     "* codellama-13b-instruct\n",
     "* codellama-34b-instruct\n",
-    "* codellama-70b-instruct"
+    "* llama-2-13b-chat\n",
+    "* llama-2-70b-chat\n",
+    "* llamaguard-7b"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c12fc2cb",
+   "id": "21fe3849",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# set llm to be using Llama2 hosted on OctoAI\n",
-    "llama2_13b = \"llama-2-13b-chat-fp16\"\n",
+    "# use ServiceContext to configure the LLM used and the custom embeddings\n",
+    "from llama_index.core import ServiceContext\n",
     "\n",
-    "llm = OctoAIEndpoint(\n",
-    "    endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n",
-    "    model_kwargs={\n",
-    "        \"model\": llama2_13b,\n",
-    "        \"messages\": [\n",
-    "            {\n",
-    "                \"role\": \"system\",\n",
-    "                \"content\": \"You are a helpful, respectful and honest assistant.\"\n",
-    "            }\n",
-    "        ],\n",
-    "        \"max_tokens\": 500,\n",
-    "        \"top_p\": 1,\n",
-    "        \"temperature\": 0.01\n",
-    "    },\n",
-    ")"
+    "# VectorStoreIndex is used to index custom data \n",
+    "from llama_index.core import VectorStoreIndex\n",
+    "\n",
+    "from llama_index.core import Settings, VectorStoreIndex\n",
+    "from llama_index.embeddings.octoai import OctoAIEmbedding\n",
+    "from llama_index.llms.octoai import OctoAI\n",
+    "\n",
+    "Settings.llm = OctoAI(\n",
+    "    model=\"meta-llama-3-8b-instruct\",\n",
+    "    token=OCTOAI_API_TOKEN,\n",
+    "    temperature=0.0,\n",
+    "    max_tokens=128,\n",
+    ")\n",
+    "\n",
+    "Settings.embed_model = OctoAIEmbedding(api_key=OCTOAI_API_TOKEN)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "476d72da",
+   "id": "f8ff812b",
    "metadata": {},
    "source": [
-    "Using our api key we set up earlier, we make a request from YOU.com for live data on a particular topic."
+    "Next you will use the [Tavily](https://tavily.com/) search engine to augment the Llama 3's responses. To create a free trial Tavily Search API, sign in with your Google or Github account [here](https://app.tavily.com/sign-in)."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "effc9656-b18d-4d24-a80b-6066564a838b",
+   "id": "75275628-5235-4b55-8033-601c76107528",
    "metadata": {},
    "outputs": [],
    "source": [
-    "import requests\n",
+    "from tavily import TavilyClient\n",
     "\n",
-    "query = \"Meta Connect\" # you can try other live data query about sports score, stock market and weather info \n",
-    "headers = {\"X-API-Key\": os.environ[\"YOUCOM_API_KEY\"]}\n",
-    "data = requests.get(\n",
-    "    f\"https://api.ydc-index.io/search?query={query}\",\n",
-    "    headers=headers,\n",
-    ").json()"
+    "TAVILY_API_KEY = getpass()\n",
+    "tavily = TavilyClient(api_key=TAVILY_API_KEY)"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8bed3baf-742e-473c-ada1-4459012a8a2c",
+   "cell_type": "markdown",
+   "id": "476d72da",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# check the query result in JSON\n",
-    "import json\n",
-    "\n",
-    "print(json.dumps(data, indent=2))"
+    "Do a live web search on \"Llama 3 fine-tuning\"."
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "b196e697",
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "effc9656-b18d-4d24-a80b-6066564a838b",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "We then use the [`JSONLoader`](https://llamahub.ai/l/file-json) to extract the text from the returned data. The `JSONLoader` gives us the ability to load the data into LamaIndex.\n",
-    "In the next cell we show how to load the JSON result with key info stored as \"snippets\".\n",
-    "\n",
-    "However, you can also add the snippets in the query result to documents like below:\n",
-    "```python \n",
-    "from llama_index import Document\n",
-    "snippets = [snippet for hit in data[\"hits\"] for snippet in hit[\"snippets\"]]\n",
-    "documents = [Document(text=s) for s in snippets]\n",
-    "```\n",
-    "This can be handy if you just need to add a list of text strings to doc"
+    "response = tavily.search(query=\"Llama 3 fine-tuning\")\n",
+    "context = [{\"url\": obj[\"url\"], \"content\": obj[\"content\"]} for obj in response['results']]"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "7c40e73f-ca13-4f4a-a753-e613df3d389e",
+   "id": "6b5af98b-c26b-4fd7-8031-31ac4915cdac",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# one way to load the JSON result with key info stored as \"snippets\"\n",
-    "from llama_index import download_loader\n",
-    "\n",
-    "JsonDataReader = download_loader(\"JsonDataReader\")\n",
-    "loader = JsonDataReader()\n",
-    "documents = loader.load_data([hit[\"snippets\"] for hit in data[\"hits\"]])\n"
+    "context"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "8e5e3b4e",
+   "id": "0f4ea96b-bb00-4a1f-8bd2-7f15237415f6",
    "metadata": {},
    "source": [
-    "With the data set up, we create a vector store for the data and a query engine for it.\n",
-    "\n",
-    "For our embeddings we will use `OctoAIEmbeddings` whose default embedding model is GTE-Large. This model provides a good balance between speed and performance.\n",
-    "\n",
-    "For more info see https://octoai.cloud/tools/text/embeddings?mode=demo&model=thenlper%2Fgte-large. "
+    "Create documents based on the search results, index and save them to a vector store, then create a query engine."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "a5de3080-2c4b-479c-baba-793b3bee36ed",
+   "id": "7513ac70-155a-4d56-b326-0e8c2733ab99",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# use OctoAI embeddings \n",
-    "from langchain_community.embeddings import OctoAIEmbeddings\n",
-    "from llama_index.embeddings import LangchainEmbedding\n",
-    "\n",
-    "\n",
-    "embeddings = LangchainEmbedding(OctoAIEmbeddings(\n",
-    "    endpoint_url=\"https://text.octoai.run/v1/embeddings\"\n",
-    "))\n",
-    "print(embeddings)\n",
-    "\n",
-    "# create a ServiceContext instance to use Llama2 and custom embeddings\n",
-    "service_context = ServiceContext.from_defaults(llm=llm, chunk_size=800, chunk_overlap=20, embed_model=embeddings)\n",
+    "from llama_index.core import Document\n",
     "\n",
-    "# create vector store index from the documents created above\n",
-    "index = VectorStoreIndex.from_documents(documents, service_context=service_context)\n",
+    "documents = [Document(text=ct['content']) for ct in context]\n",
+    "index = VectorStoreIndex.from_documents(documents)\n",
     "\n",
-    "# create query engine from the index\n",
-    "query_engine = index.as_query_engine(streaming=False)"
+    "query_engine = index.as_query_engine(streaming=True)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "2c4ea012",
+   "id": "df743c62-165c-4834-b1f1-7d7848a6815e",
    "metadata": {},
    "source": [
-    "We are now ready to ask Llama 2 a question about the live data using our query engine."
+    "You are now ready to ask Llama 3 questions about the live data using the query engine."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "de91a191-d0f2-498e-88dc-b2b43423e0e5",
+   "id": "b2fd905b-575a-45f1-88da-9b093caa232a",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ask Llama2 a summary question about the search result\n",
     "response = query_engine.query(\"give me a summary\")\n",
-    "print(str(response))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "72814b20-06aa-4da8-b4dd-f0b0d74a2ea0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# more questions\n",
-    "print(str(query_engine.query(\"what products were announced\")))"
+    "response.print_response_stream()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "a65bc037-a689-476d-b529-0059a27bc949",
+   "id": "88c45380-1d00-46d5-80ac-0eff68fd1f8a",
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(str(query_engine.query(\"tell me more about Meta AI assistant\")))"
+    "query_engine.query(\"what's the latest about Llama 3 fine-tuning?\").print_response_stream()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "16a56542",
+   "id": "0fe54976-5345-4426-a6f0-dc3bfd45dac3",
    "metadata": {},
    "outputs": [],
    "source": [
-    "print(str(query_engine.query(\"what are Generative AI stickers\")))"
+    "query_engine.query(\"tell me more about Llama 3 fine-tuning\").print_response_stream()"
    ]
   }
  ],
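
The replacement cells above split the Tavily search, indexing, and querying across several hunks; here they are stitched into one end-to-end sketch. Package names come from the new install cell, and the API keys are read from environment variables here rather than collected with `getpass()` as the notebook does.

```python
# End-to-end sketch of the updated live-data flow: Tavily results become LlamaIndex documents,
# indexed with OctoAI embeddings and queried with Llama 3.
# Keys are read from the environment here; the notebook collects them with getpass().
import os

from llama_index.core import Document, Settings, VectorStoreIndex
from llama_index.embeddings.octoai import OctoAIEmbedding
from llama_index.llms.octoai import OctoAI
from tavily import TavilyClient

OCTOAI_API_TOKEN = os.environ["OCTOAI_API_TOKEN"]

Settings.llm = OctoAI(
    model="meta-llama-3-8b-instruct",
    token=OCTOAI_API_TOKEN,
    temperature=0.0,
    max_tokens=128,
)
Settings.embed_model = OctoAIEmbedding(api_key=OCTOAI_API_TOKEN)

# Live web search, then wrap each result's content as a Document
tavily = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])
results = tavily.search(query="Llama 3 fine-tuning")["results"]
documents = [Document(text=r["content"]) for r in results]

# Index the documents and stream answers grounded in the search results
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine(streaming=True)
query_engine.query("give me a summary").print_response_stream()
```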

+ 27 - 31
recipes/llama_api_providers/OctoAI_API_examples/Llama2_Gradio.ipynb

@@ -5,14 +5,14 @@
    "id": "47a9adb3",
    "metadata": {},
    "source": [
-    "## This demo app shows how to query Llama 2 using the Gradio UI.\n",
+    "## This demo app shows how to query Llama 3 using the Gradio UI.\n",
     "\n",
     "Since we are using OctoAI in this example, you'll need to obtain an OctoAI token:\n",
     "\n",
     "- You will need to first sign into [OctoAI](https://octoai.cloud/) with your Github or Google account\n",
     "- Then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first)\n",
     "\n",
-    "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI.\n",
+    "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama 3 hosted on OctoAI.\n",
     "\n",
     "To run this example:\n",
     "- Run the notebook\n",
@@ -22,8 +22,7 @@
     "In the notebook or a browser with URL http://127.0.0.1:7860 you should see a UI with your answer.\n",
     "\n",
     "Let's start by installing the necessary packages:\n",
-    "- langchain provides necessary RAG tools for this demo\n",
-    "- octoai-sdk allows us to use OctoAI Llama 2 endpoint\n",
+    "- openai for us to use its APIs to talk to the OctoAI endpoint\n",
     "- gradio is used for the UI elements\n",
     "\n",
     "And setting up the OctoAI token."
@@ -36,7 +35,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install langchain octoai-sdk gradio"
+    "!pip install openai gradio"
    ]
   },
   {
@@ -60,37 +59,34 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from langchain.schema import AIMessage, HumanMessage\n",
     "import gradio as gr\n",
-    "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
+    "import openai\n",
     "\n",
-    "llama2_13b = \"llama-2-13b-chat-fp16\"\n",
-    "\n",
-    "llm = OctoAIEndpoint(\n",
-    "    endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n",
-    "    model_kwargs={\n",
-    "        \"model\": llama2_13b,\n",
-    "        \"messages\": [\n",
-    "            {\n",
-    "                \"role\": \"system\",\n",
-    "                \"content\": \"You are a helpful, respectful and honest assistant.\"\n",
-    "            }\n",
-    "        ],\n",
-    "        \"max_tokens\": 500,\n",
-    "        \"top_p\": 1,\n",
-    "        \"temperature\": 0.01\n",
-    "    },\n",
+    "# Init OctoAI client\n",
+    "client = openai.OpenAI(\n",
+    "    base_url=\"https://text.octoai.run/v1\",\n",
+    "    api_key=os.environ[\"OCTOAI_API_TOKEN\"]\n",
     ")\n",
     "\n",
-    "\n",
     "def predict(message, history):\n",
-    "    history_langchain_format = []\n",
-    "    for human, ai in history:\n",
-    "        history_langchain_format.append(HumanMessage(content=human))\n",
-    "        history_langchain_format.append(AIMessage(content=ai))\n",
-    "    history_langchain_format.append(HumanMessage(content=message))\n",
-    "    llm_response = llm(message, history_langchain_format)\n",
-    "    return llm_response.content\n",
+    "    history_openai_format = []\n",
+    "    for human, assistant in history:\n",
+    "        history_openai_format.append({\"role\": \"user\", \"content\": human})\n",
+    "        history_openai_format.append({\"role\": \"assistant\", \"content\": assistant})\n",
+    "    history_openai_format.append({\"role\": \"user\", \"content\": message})\n",
+    "\n",
+    "    response = client.chat.completions.create(\n",
+    "        model = 'meta-llama-3-70b-instruct',\n",
+    "        messages = history_openai_format,\n",
+    "        temperature = 0.0,\n",
+    "        stream = True\n",
+    "     )\n",
+    "\n",
+    "    partial_message = \"\"\n",
+    "    for chunk in response:\n",
+    "        if chunk.choices[0].delta.content is not None:\n",
+    "              partial_message = partial_message + chunk.choices[0].delta.content\n",
+    "              yield partial_message\n",
     "\n",
     "gr.ChatInterface(predict).launch()"
    ]
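
The `predict` function above is spread over many added lines; for readability, here it is as one self-contained sketch. The only addition is the `import os` needed for the token lookup (the notebook sets `OCTOAI_API_TOKEN` in an earlier cell); everything else mirrors the diff.

```python
# Self-contained version of the streaming Gradio chatbot assembled in the hunks above.
# import os is added for self-containment; the notebook sets OCTOAI_API_TOKEN in an earlier cell.
import os

import gradio as gr
import openai

# Init OctoAI client through its OpenAI-compatible endpoint
client = openai.OpenAI(
    base_url="https://text.octoai.run/v1",
    api_key=os.environ["OCTOAI_API_TOKEN"],
)

def predict(message, history):
    # Rebuild the OpenAI-style message list from Gradio's (user, assistant) history pairs
    history_openai_format = []
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    response = client.chat.completions.create(
        model="meta-llama-3-70b-instruct",
        messages=history_openai_format,
        temperature=0.0,
        stream=True,
    )

    # Yield the growing reply so the UI streams tokens as they arrive
    partial_message = ""
    for chunk in response:
        if chunk.choices[0].delta.content is not None:
            partial_message += chunk.choices[0].delta.content
            yield partial_message

gr.ChatInterface(predict).launch()
```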

+ 23 - 29
recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/RAG_Chatbot_Example.ipynb

@@ -4,16 +4,16 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Building a Llama 2 chatbot with Retrieval Augmented Generation (RAG)\n",
+    "# Building a Llama 3 chatbot with Retrieval Augmented Generation (RAG)\n",
     "\n",
     "This notebook shows a complete example of how to build a Llama 2 chatbot hosted on your browser that can answer questions based on your own data. We'll cover:\n",
-    "* How to run Llama2 in the cloud hosted on OctoAI\n",
+    "* How to run Llama 3 in the cloud hosted on OctoAI\n",
     "* A chatbot example built with [Gradio](https://github.com/gradio-app/gradio) and wired to the server\n",
-    "* Adding RAG capability with Llama 2 specific knowledge based on our Getting Started [guide](https://ai.meta.com/llama/get-started/)\n",
+    "* Adding RAG capability with Llama 3 specific knowledge based on our Getting Started [guide](https://ai.meta.com/llama/get-started/)\n",
     "\n",
     "\n",
     "**Note** We will be using OctoAI to run the examples here. You will need to first sign into [OctoAI](https://octoai.cloud/) with your Github or Google account, then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first).\n",
-    "After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI."
+    "After the free trial ends, you will need to enter billing info to continue to use Llama 3 hosted on OctoAI."
    ]
   },
   {
@@ -51,14 +51,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## How to Develop a RAG Powered Llama 2 Chatbot\n",
+    "## How to Develop a RAG Powered Llama 3 Chatbot\n",
     "\n",
-    "The easiest way to develop RAG-powered Llama 2 chatbots is to use frameworks such as [**LangChain**](https://www.langchain.com/) and [**LlamaIndex**](https://www.llamaindex.ai/), two leading open-source frameworks for building LLM apps. Both offer convenient APIs for implementing RAG with Llama 2 including:\n",
+    "The easiest way to develop RAG-powered Llama 3 chatbots is to use frameworks such as [**LangChain**](https://www.langchain.com/) and [**LlamaIndex**](https://www.llamaindex.ai/), two leading open-source frameworks for building LLM apps. Both offer convenient APIs for implementing RAG with Llama 3 including:\n",
     "\n",
     "* Load and split documents\n",
     "* Embed and store document splits\n",
     "* Retrieve the relevant context based on the user query\n",
-    "* Call Llama 2 with query and context to generate the answer\n",
+    "* Call Llama 3 with query and context to generate the answer\n",
     "\n",
     "LangChain is a more general purpose and flexible framework for developing LLM apps with RAG capabilities, while LlamaIndex as a data framework focuses on connecting custom data sources to LLMs. The integration of the two may provide the best performant and effective solution to building real world RAG apps.\n",
     "In our example, for simplicifty, we will use LangChain alone with locally stored PDF data."
@@ -73,7 +73,7 @@
     "For this demo, we will be using the Gradio for chatbot UI, Text-generation-inference framework for model serving.\n",
     "For vector storage and similarity search, we will be using [FAISS](https://github.com/facebookresearch/faiss).\n",
     "In this example, we will be running everything in a AWS EC2 instance (i.e. [g5.2xlarge]( https://aws.amazon.com/ec2/instance-types/g5/)). g5.2xlarge features one A10G GPU. We recommend running this notebook with at least one GPU equivalent to A10G with at least 16GB video memory.\n",
-    "There are certain techniques to downsize the Llama 2 7B model, so it can fit into smaller GPUs. But it is out of scope here.\n",
+    "There are certain techniques to downsize the Llama 3 7B model, so it can fit into smaller GPUs. But it is out of scope here.\n",
     "\n",
     "First, let's install all dependencies with PIP. We also recommend you start a dedicated Conda environment for better package management.\n",
     "\n",
@@ -109,7 +109,7 @@
     "### Data Processing\n",
     "\n",
     "First run all the imports and define the path of the data and vector storage after processing.\n",
-    "For the data, we will be using a raw pdf crawled from Llama 2 Getting Started guide on [Meta AI website](https://ai.meta.com/llama/)."
+    "For the data, we will be using a raw pdf crawled from \"Llama 2 Getting Started\" guide on [Meta AI website](https://ai.meta.com/llama/)."
    ]
   },
   {
@@ -276,14 +276,12 @@
     "from langchain.prompts.prompt import PromptTemplate\n",
     "from anyio.from_thread import start_blocking_portal #For model callback streaming\n",
     "\n",
-    "# langchain.debug=True\n",
-    "\n",
-    "#vector db path\n",
+    "# Vector db path\n",
     "DB_FAISS_PATH = 'vectorstore/db_faiss'\n",
     "\n",
     "model_dict = {\n",
-    "    \"13-chat\" : \"llama-2-13b-chat-fp16\",\n",
-    "    \"70b-chat\" : \"llama-2-70b-chat-fp16\",\n",
+    "    \"8b-instruct\" : \"meta-llama-3-8b-instruct\",\n",
+    "    \"70b-instruct\" : \"meta-llama-3-70b-instruct\",\n",
     "}\n",
     "\n",
     "system_message = {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}"
@@ -303,22 +301,24 @@
    "outputs": [],
    "source": [
     "embeddings = OctoAIEmbeddings(endpoint_url=\"https://text.octoai.run/v1/embeddings\")\n",
-    "db = FAISS.load_local(DB_FAISS_PATH, embeddings)"
+    "db = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Next we call the Llama 2 model from OctoAI. In this example we will use the Llama 2 13b chat FP16 model. You can find more on Llama 2 models on the [OctoAI text generation solution page](https://octoai.cloud/tools/text).\n",
+    "Next we call the Llama 3 model from OctoAI. In this example we will use the Llama 3 8b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
     "\n",
     "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
-    "* llama-2-13b-chat\n",
-    "* llama-2-70b-chat\n",
+    "* meta-llama-3-8b-instruct\n",
+    "* meta-llama-3-70b-instruct\n",
     "* codellama-7b-instruct\n",
     "* codellama-13b-instruct\n",
     "* codellama-34b-instruct\n",
-    "* codellama-70b-instruct"
+    "* llama-2-13b-chat\n",
+    "* llama-2-70b-chat\n",
+    "* llamaguard-7b"
    ]
   },
   {
@@ -329,16 +329,10 @@
    "source": [
     "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
     "\n",
-    "llama2_13b = \"llama-2-13b-chat-fp16\"\n",
     "llm = OctoAIEndpoint(\n",
-    "    endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n",
-    "    model_kwargs={\n",
-    "        \"model\": llama2_13b,\n",
-    "        \"messages\": [system_message],\n",
-    "        \"max_tokens\": 500,\n",
-    "        \"top_p\": 1,\n",
-    "        \"temperature\": 0.01\n",
-    "    },\n",
+    "    model=model_dict[\"8b-instruct\"],\n",
+    "    max_tokens=500,\n",
+    "    temperature=0.01\n",
     ")"
    ]
   },
@@ -347,7 +341,7 @@
    "metadata": {},
    "source": [
     "Next, we define the retriever and template for our RetrivalQA chain. For each call of the RetrievalQA, LangChain performs a semantic similarity search of the query in the vector database, then passes the search results as the context to Llama to answer the query about the data stored in the verctor database.\n",
-    "Whereas for the template, this defines the format of the question along with context that we will be sent into Llama for generation. In general, Llama 2 has special prompt format to handle special tokens. In some cases, the serving framework might already have taken care of it. Otherwise, you will need to write customized template to properly handle that."
+    "Whereas for the template, this defines the format of the question along with context that we will be sent into Llama for generation. In general, Llama 3 has special prompt format to handle special tokens. In some cases, the serving framework might already have taken care of it. Otherwise, you will need to write customized template to properly handle that."
    ]
   },
   {
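
A sketch of how such a retriever-plus-template chain can be wired is shown below. The embeddings, FAISS loading, and `OctoAIEndpoint` settings are taken from the hunks in this diff; the prompt text and the use of `RetrievalQA.from_chain_type` are assumptions, since the notebook's own chain construction is not visible in the shown context.

```python
# Illustrative wiring for the retriever + template step described above.
# Embeddings, FAISS loading, and OctoAIEndpoint settings come from this diff;
# the prompt text and RetrievalQA.from_chain_type usage are assumptions.
from langchain.chains import RetrievalQA
from langchain.llms.octoai_endpoint import OctoAIEndpoint
from langchain.prompts.prompt import PromptTemplate
from langchain_community.embeddings import OctoAIEmbeddings
from langchain_community.vectorstores import FAISS

DB_FAISS_PATH = "vectorstore/db_faiss"

embeddings = OctoAIEmbeddings(endpoint_url="https://text.octoai.run/v1/embeddings")
db = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)

llm = OctoAIEndpoint(model="meta-llama-3-8b-instruct", max_tokens=500, temperature=0.01)

# A hypothetical prompt that stuffs the retrieved context ahead of the question
template = (
    "Use the following context to answer the question at the end.\n\n"
    "{context}\n\n"
    "Question: {question}\n"
    "Answer:"
)
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    chain_type_kwargs={"prompt": prompt},
)
print(qa_chain({"query": "How do I get started with Llama 3?"})["result"])
```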

+ 3 - 3
recipes/llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/requirements.txt

@@ -1,7 +1,7 @@
-gradio==4.16.0
+gradio==4.19.2
 pypdf==4.0.0
-langchain==0.1.7
+langchain==0.1.19
 sentence-transformers==2.2.2
 faiss-cpu==1.7.4
 text-generation==0.6.1
-octoai-sdk==0.8.3
+octoai-sdk==0.10.1

+ 79 - 126
recipes/llama_api_providers/OctoAI_API_examples/VideoSummary.ipynb

@@ -7,8 +7,8 @@
    "source": [
     "## This demo app shows:\n",
     "* How to use LangChain's YoutubeLoader to retrieve the caption in a YouTube video\n",
-    "* How to ask Llama to summarize the content (per the Llama's input size limit) of the video in a naive way using LangChain's stuff method\n",
-    "* How to bypass the limit of Llama's max input token size by using a more sophisticated way using LangChain's map_reduce and refine methods - see [here](https://python.langchain.com/docs/use_cases/summarization) for more info"
+    "* How to ask Llama 3 to summarize the content (per the Llama's input size limit) of the video in a naive way using LangChain's stuff method\n",
+    "* How to bypass the limit of Llama 3's max input token size by using a more sophisticated way using LangChain's map_reduce and refine methods - see [here](https://python.langchain.com/docs/use_cases/summarization) for more info"
    ]
   },
   {
@@ -22,7 +22,7 @@
     "- [tiktoken](https://github.com/openai/tiktoken) BytePair Encoding tokenizer\n",
     "- [pytube](https://pytube.io/en/latest/) Utility for downloading YouTube videos\n",
     "\n",
-    "**Note** This example uses OctoAI to host the Llama model. If you have not set up/or used OctoAI before, we suggest you take a look at the [HelloLlamaCloud](HelloLlamaCloud.ipynb) example for information on how to set up OctoAI before continuing with this example.\n",
+    "**Note** This example uses OctoAI to host the Llama 3 model. If you have not set up/or used OctoAI before, we suggest you take a look at the [HelloLlamaCloud](HelloLlamaCloud.ipynb) example for information on how to set up OctoAI before continuing with this example.\n",
     "If you do not want to use OctoAI, you will need to make some changes to this notebook as you go along."
    ]
   },
@@ -33,7 +33,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install langchain octoai-sdk youtube-transcript-api tiktoken pytube"
+    "!pip install langchain==0.1.19 youtube-transcript-api tiktoken pytube"
    ]
   },
   {
@@ -41,7 +41,7 @@
    "id": "af3069b1",
    "metadata": {},
    "source": [
-    "Let's load the YouTube video transcript using the YoutubeLoader."
+    "Let's first load a long (2:47:16) YouTube video (Lex Fridman with Yann Lecun: Meta AI, Open Source, Limits of LLMs, AGI & the Future of AI) transcript using the YoutubeLoader."
    ]
   },
   {
@@ -54,7 +54,7 @@
     "from langchain.document_loaders import YoutubeLoader\n",
     "\n",
     "loader = YoutubeLoader.from_youtube_url(\n",
-    "    \"https://www.youtube.com/watch?v=1k37OcjH7BM\", add_video_info=True\n",
+    "    \"https://www.youtube.com/watch?v=5t1vTLU7s40\", add_video_info=True\n",
     ")"
    ]
   },
@@ -85,17 +85,16 @@
    "id": "4af7cc16",
    "metadata": {},
    "source": [
-    "We are using OctoAI in this example to host our Llama 2 model so you will need to get a OctoAI token.\n",
+    "You should see 142689 returned for the doc character length, which is about 30k words or 40k tokens, beyond the 8k context length limit of Llama 3. You'll see how to summarize a text longer than the limit.\n",
+    "\n",
+    "**Note**: We are using OctoAI in this example to host our Llama 3 model so you will need to get a OctoAI token.\n",
     "\n",
     "To get the OctoAI token:\n",
     "\n",
     "- You will need to first sign in with OctoAI with your github account\n",
     "- Then create a free API token [here](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token) that you can use for a while (a month or $10 in OctoAI credits, whichever one runs out first)\n",
     "\n",
-    "**Note** After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI.\n",
-    "\n",
-    "Alternatively, you can run Llama locally. See:\n",
-    "- [HelloLlamaLocal](HelloLlamaLocal.ipynb) for further information on how to run Llama locally."
+    "After the free trial ends, you will need to enter billing info to continue to use Llama2 hosted on OctoAI."
    ]
   },
   {
@@ -118,17 +117,17 @@
    "id": "6b911efd",
    "metadata": {},
    "source": [
-    "Next we call the Llama 2 model from OctoAI. In this example we will use the Llama 2 13b chat FP16 model. You can find more on Llama 2 models on the [OctoAI text generation solution page](https://octoai.cloud/tools/text).\n",
+    "Next we call the Llama 3 model from OctoAI. In this example we will use the Llama 3 8b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
     "\n",
     "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
-    "* llama-2-13b-chat\n",
-    "* llama-2-70b-chat\n",
+    "* meta-llama-3-8b-instruct\n",
+    "* meta-llama-3-70b-instruct\n",
     "* codellama-7b-instruct\n",
     "* codellama-13b-instruct\n",
     "* codellama-34b-instruct\n",
-    "* codellama-70b-instruct\n",
-    "\n",
-    "If you using local Llama, just set llm accordingly - see the [HelloLlamaLocal notebook](HelloLlamaLocal.ipynb)"
+    "* llama-2-13b-chat\n",
+    "* llama-2-70b-chat\n",
+    "* llamaguard-7b"
    ]
   },
   {
@@ -140,21 +139,11 @@
    "source": [
     "from langchain.llms.octoai_endpoint import OctoAIEndpoint\n",
     "\n",
-    "llama2_13b = \"llama-2-13b-chat-fp16\"\n",
+    "llama3_8b = \"meta-llama-3-8b-instruct\"\n",
     "llm = OctoAIEndpoint(\n",
-    "    endpoint_url=\"https://text.octoai.run/v1/chat/completions\",\n",
-    "    model_kwargs={\n",
-    "        \"model\": llama2_13b,\n",
-    "        \"messages\": [\n",
-    "            {\n",
-    "                \"role\": \"system\",\n",
-    "                \"content\": \"You are a helpful, respectful and honest assistant.\"\n",
-    "            }\n",
-    "        ],\n",
-    "        \"max_tokens\": 500,\n",
-    "        \"top_p\": 1,\n",
-    "        \"temperature\": 0.01\n",
-    "    },\n",
+    "    model=llama3_8b,\n",
+    "    max_tokens=500,\n",
+    "    temperature=0.01\n",
     ")"
    ]
   },
@@ -163,7 +152,7 @@
    "id": "8e3baa56",
    "metadata": {},
    "source": [
-    "Once everything is set up, we prompt Llama 2 to summarize the first 4000 characters of the transcript for us."
+    "Once everything is set up, we prompt Llama 3 to summarize the first 4000 characters of the transcript for us."
    ]
   },
   {
@@ -173,90 +162,74 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from langchain.prompts import ChatPromptTemplate\n",
+    "from langchain.prompts import PromptTemplate\n",
     "from langchain.chains import LLMChain\n",
-    "prompt = ChatPromptTemplate.from_template(\n",
-    "    \"Give me a summary of the text below: {text}?\"\n",
+    "\n",
+    "prompt_template = \"Give me a summary of the text below: {text}?\"\n",
+    "prompt = PromptTemplate(\n",
+    "    input_variables=[\"text\"], template=prompt_template\n",
     ")\n",
-    "chain = LLMChain(llm=llm, prompt=prompt)\n",
+    "chain = prompt | llm\n",
+    "\n",
     "# be careful of the input text length sent to LLM\n",
-    "text = docs[0].page_content[:4000]\n",
-    "summary = chain.run(text)\n",
-    "# this is the summary of the first 4000 characters of the video content\n",
+    "text = docs[0].page_content[:10000]\n",
+    "summary = chain.invoke(text)\n",
+    "\n",
+    "# Note: The context length of 8k tokens in Llama 3 is roughly 6000-7000 words or 32k characters\n",
     "print(summary)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "8b684b29",
+   "id": "1ad1881a",
    "metadata": {},
    "source": [
-    "Next we try to summarize all the content of the transcript and we should get a `RuntimeError: Your input is too long. Max input length is 4096 tokens, but you supplied 5597 tokens.`."
+    "If you try the whole content which has over 142k characters, about 40k tokens, which exceeds the 8k limit, you'll get an empty result (OctoAI used to return an error \"BadRequestError: The token count (32704) of your prompt (32204) + your setting of `max_tokens` (500) cannot exceed this model's context length (8192).\")."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "88a2c17f",
+   "id": "61a088b7-cba2-4603-ba7c-f6673bfaa3cd",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# try to get a summary of the whole content\n",
+    "# this will generate an empty result because the input exceeds Llama 3's context length limit\n",
     "text = docs[0].page_content\n",
-    "summary = chain.run(text)\n",
+    "summary = llm.invoke(f\"Give me a summary of the text below: {text}.\")\n",
     "print(summary)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "1ad1881a",
+   "id": "e112845f-de16-4c2f-8afe-6cca31f6fa38",
    "metadata": {},
    "source": [
+    "To fix this, you can use LangChain's load_summarize_chain method (detail [here](https://python.langchain.com/docs/use_cases/summarization)).\n",
     "\n",
-    "Let's try some workarounds to see if we can summarize the entire transcript without running into the `RuntimeError`.\n",
+    "First you'll create splits or sub-documents of the original content, then use the LangChain's `load_summarize_chain` with the `refine` or `map_reduce type`.\n",
     "\n",
-    "We will use the LangChain's `load_summarize_chain` and play around with the `chain_type`.\n"
+    "Because this may involve many calls to Llama 3, it'd be great to set up a quick free LangChain API key [here](https://smith.langchain.com/settings), run the following cell to set up necessary environment variables, and check the logs on [LangSmith](https://docs.smith.langchain.com/) during and after the run."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9bfee2d3-3afe-41d9-8968-6450cc23f493",
+   "id": "55586a09-db53-4741-87d8-fdfb40d9f8cb",
    "metadata": {},
    "outputs": [],
    "source": [
-    "from langchain.chains.summarize import load_summarize_chain\n",
-    "# see https://python.langchain.com/docs/use_cases/summarization for more info\n",
-    "chain = load_summarize_chain(llm, chain_type=\"stuff\") # other supported methods are map_reduce and refine\n",
-    "chain.run(docs)\n",
-    "# same RuntimeError: Your input is too long. but stuff works for shorter text with input length <= 4096 tokens"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "682799a8-3846-41b1-a908-02ab5ac3ecee",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chain = load_summarize_chain(llm, chain_type=\"refine\")\n",
-    "# still get the \"RuntimeError: Your input is too long. Max input length is 4096 tokens\"\n",
-    "chain.run(docs)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "aecf6328",
-   "metadata": {},
-   "source": [
-    "\n",
-    "Since the transcript is bigger than the model can handle, we can split the transcript into chunks instead and use the [`refine`](https://python.langchain.com/docs/modules/chains/document/refine) `chain_type` to iteratively create an answer."
+    "import os\n",
+    "os.environ[\"LANGCHAIN_API_KEY\"] = \"your_langchain_api_key\"\n",
+    "os.environ[\"LANGCHAIN_API_KEY\"] = \"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\"\n",
+    "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
+    "os.environ[\"LANGCHAIN_PROJECT\"] = \"Video Summary with Llama 3\""
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3be1236a-fe6a-4bf6-983f-0e72dde39fee",
+   "id": "9bfee2d3-3afe-41d9-8968-6450cc23f493",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -264,7 +237,7 @@
     "\n",
     "# we need to split the long input text\n",
     "text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n",
-    "    chunk_size=3000, chunk_overlap=0\n",
+    "    chunk_size=1000, chunk_overlap=0\n",
     ")\n",
     "split_docs = text_splitter.split_documents(docs)"
    ]
@@ -272,7 +245,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "12ae9e9d-3434-4a84-a298-f2b98de9ff01",
+   "id": "682799a8-3846-41b1-a908-02ab5ac3ecee",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -281,81 +254,61 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "127f17fe-d5b7-43af-bd2f-2b47b076d0b1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# now get the summary of the whole docs - the whole youtube content\n",
-    "chain = load_summarize_chain(llm, chain_type=\"refine\")\n",
-    "print(str(chain.run(split_docs)))"
-   ]
-  },
-  {
    "cell_type": "markdown",
-   "id": "c3976c92",
+   "id": "aecf6328",
    "metadata": {},
    "source": [
-    "You can also use [`map_reduce`](https://python.langchain.com/docs/modules/chains/document/map_reduce) `chain_type` to implement a map reduce like architecture while summarizing the documents."
+    "The `refine` type implements the following steps under the hood:\n",
+    "\n",
+    "1. Call Llama 3 on the first sub-document to generate a concise summary;\n",
+    "2. Loop over each subsequent sub-document, pass the previous summary with the current sub-document to generate a refined new summary;\n",
+    "3. Return the final summary generated on the final sub-document as the final answer - the summary of the whole content.\n",
+    "\n",
+    "An example prompt template for each call in step 2, which gets used under the hood by LangChain, is:\n",
+    "\n",
+    "```\n",
+    "Your job is to produce a final summary.\n",
+    "We have provided an existing summary up to a certain point:\n",
+    "<previous_summary>\n",
+    "Refine the existing summary (only if needed) with some more content below:\n",
+    "<new_content>\n",
+    "```\n",
+    "\n",
+    "**Note**: The following call will make 33 calls to Llama 3 and genereate the final summary in about 10 minutes."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8991df49-8578-46de-8b30-cb2cd11e30f1",
+   "id": "3be1236a-fe6a-4bf6-983f-0e72dde39fee",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# another method is map_reduce\n",
-    "chain = load_summarize_chain(llm, chain_type=\"map_reduce\")\n",
-    "print(str(chain.run(split_docs)))"
+    "from langchain.chains.summarize import load_summarize_chain\n",
+    "\n",
+    "chain = load_summarize_chain(llm, chain_type=\"refine\")\n",
+    "print(chain.run(split_docs))"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "77d580de",
+   "id": "752f2b71-5fd6-4a8a-ac09-371bce1db703",
    "metadata": {},
    "source": [
-    "To investigate further, let's turn on Langchain's debug mode on to get an idea of how many calls are made to the model and the details of the inputs and outputs.\n",
-    "We will then run our summary using the `stuff` and `refine` `chain_types` and take a look at our output."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f2138911-d2b9-41f3-870f-9bc37e2043d9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# to find how many calls to Llama have been made and the details of inputs and outputs of each call, set langchain to debug\n",
-    "import langchain\n",
-    "langchain.debug = True\n",
+    "You can also set `chain_type` to `map_reduce` to generate the summary of the entire content using the standard map and reduce method, which works behind the scene by first mapping each split document to a sub-summary via a call to LLM, then combines all those sub-summaries into a single final summary by yet another call to LLM.\n",
     "\n",
-    "# stuff method will cause the error in the end\n",
-    "chain = load_summarize_chain(llm, chain_type=\"stuff\")\n",
-    "chain.run(split_docs)"
+    "**Note**: The following call takes about 3 minutes and all the calls to Llama 3."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "60d1a531-ab48-45cc-a7de-59a14e18240d",
+   "id": "8991df49-8578-46de-8b30-cb2cd11e30f1",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# but refine works\n",
-    "chain = load_summarize_chain(llm, chain_type=\"refine\")\n",
-    "chain.run(split_docs)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "61ccd0fb-5cdb-43c4-afaf-05bc9f7cf959",
-   "metadata": {},
-   "source": [
-    "\n",
-    "As you can see, `stuff` fails because it tries to treat all the split documents as one and \"stuffs\" it into one prompt which leads to a much larger prompt than Llama 2 can handle while `refine` iteratively runs over the documents updating its answer as it goes."
+    "chain = load_summarize_chain(llm, chain_type=\"map_reduce\")\n",
+    "print(chain.run(split_docs))"
    ]
   }
  ],

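Pieced together, the updated VideoSummary.ipynb cells above amount to roughly the end-to-end flow below. It is a sketch under the notebook's own assumptions (the pinned packages installed above and an `OCTOAI_API_TOKEN` in the environment), not additional code from this PR.

```python
from langchain.document_loaders import YoutubeLoader
from langchain.llms.octoai_endpoint import OctoAIEndpoint
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain

# 1. Load the transcript of the long interview
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=5t1vTLU7s40", add_video_info=True
)
docs = loader.load()

# 2. Llama 3 8B Instruct hosted on OctoAI
llm = OctoAIEndpoint(
    model="meta-llama-3-8b-instruct",
    max_tokens=500,
    temperature=0.01,
)

# 3. Split the ~40k-token transcript into chunks that fit the 8k context window
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
)
split_docs = text_splitter.split_documents(docs)

# 4. Iteratively refine a running summary over the chunks (many calls to Llama 3)
chain = load_summarize_chain(llm, chain_type="refine")
print(chain.run(split_docs))
```

Swapping `chain_type="refine"` for `"map_reduce"` gives the second strategy discussed in the notebook.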
+ 937 - 0
recipes/llama_api_providers/llama3_cookbook_groq.ipynb

@@ -0,0 +1,937 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "09211e76-286f-4b12-acd7-cfb082dc2d66",
+   "metadata": {},
+   "source": [
+    "# Llama 3 Cookbook with LlamaIndex and Groq\n",
+    "\n",
+    "<a href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/llama_api_providers/llama3_cookbook_groq.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
+    "\n",
+    "Meta developed and released the Meta [Llama 3](https://ai.meta.com/blog/meta-llama-3/) family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8 and 70B sizes. The Llama 3 instruction tuned models are optimized for dialogue use cases and outperform many of the available open source chat models on common industry benchmarks.\n",
+    "\n",
+    "In this notebook, we demonstrate how to use Llama 3 with LlamaIndex for a comprehensive set of use cases. \n",
+    "1. Basic completion / chat \n",
+    "2. Basic RAG (Vector Search, Summarization)\n",
+    "3. Advanced RAG (Routing)\n",
+    "4. Text-to-SQL \n",
+    "5. Structured Data Extraction\n",
+    "6. Chat Engine + Memory\n",
+    "7. Agents\n",
+    "\n",
+    "\n",
+    "We use Llama3-8B and Llama3-70B through [Groq](https://groq.com) - you can sign up there to get a free trial API key."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "de2901c0-e20d-48e5-9385-dbca2258c564",
+   "metadata": {},
+   "source": [
+    "## Installation and Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bcf643ac-b025-4812-aaed-f8f85d1ba505",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install llama-index\n",
+    "!pip install llama-index-llms-groq\n",
+    "!pip install llama-index-embeddings-huggingface\n",
+    "!pip install llama-parse"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "641fa5c8-d63e-47f8-b5bc-ebf994f6e314",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1714ea83-6cd4-44bb-b53f-4499126c3809",
+   "metadata": {},
+   "source": [
+    "### Setup LLM using Groq\n",
+    "\n",
+    "To use [Groq](https://groq.com), you need to make sure that `GROQ_API_KEY` is specified as an environment variable."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d46440c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ[\"GROQ_API_KEY\"] = \"YOUR_GROQ_API_KEY\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5256970-eba4-499a-b438-8766a290a61a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.llms.groq import Groq\n",
+    "\n",
+    "llm = Groq(model=\"llama3-8b-8192\")\n",
+    "llm_70b = Groq(model=\"llama3-70b-8192\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "41c3f154-d345-465d-8eed-63b99adbd3ca",
+   "metadata": {},
+   "source": [
+    "### Setup Embedding Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0cda736d-e414-44e3-8c15-6be49f5f0282",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
+    "\n",
+    "embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3625cf29-7c56-475a-8efd-fbe8ffce194d",
+   "metadata": {},
+   "source": [
+    "### Define Global Settings Configuration\n",
+    "\n",
+    "In LlamaIndex, you can define global settings so you don't have to pass the LLM / embedding model objects everywhere."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "be3565d1-cc5b-4149-ad5a-7be8f7818e0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import Settings\n",
+    "\n",
+    "Settings.llm = llm\n",
+    "Settings.embed_model = embed_model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "42449b68-47f5-40cf-9207-191307b25e8e",
+   "metadata": {},
+   "source": [
+    "### Download Data\n",
+    "\n",
+    "Here you'll download data that's used in section 2 and onwards.\n",
+    "\n",
+    "We'll download some articles on Kendrick, Drake, and their beef (as of May 2024)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "59b18640-cdfa-42c1-ab53-115983c1fdc4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir data\n",
+    "!wget \"https://www.dropbox.com/scl/fi/t1soxfjdp0v44an6sdymd/drake_kendrick_beef.pdf?rlkey=u9546ymb7fj8lk2v64r6p5r5k&st=wjzzrgil&dl=1\" -O data/drake_kendrick_beef.pdf\n",
+    "!wget \"https://www.dropbox.com/scl/fi/nts3n64s6kymner2jppd6/drake.pdf?rlkey=hksirpqwzlzqoejn55zemk6ld&st=mohyfyh4&dl=1\" -O data/drake.pdf\n",
+    "!wget \"https://www.dropbox.com/scl/fi/8ax2vnoebhmy44bes2n1d/kendrick.pdf?rlkey=fhxvn94t5amdqcv9vshifd3hj&st=dxdtytn6&dl=1\" -O data/kendrick.pdf"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9edee491-05f8-4fbb-9394-baa82f1e5087",
+   "metadata": {},
+   "source": [
+    "### Load Data\n",
+    "\n",
+    "We load data using LlamaParse by default, but you can also choose to opt for our free pypdf reader (in SimpleDirectoryReader by default) if you don't have an account! \n",
+    "\n",
+    "1. LlamaParse: Signup for an account here: cloud.llamaindex.ai. You get 1k free pages a day, and paid plan is 7k free pages + 0.3c per additional page. LlamaParse is a good option if you want to parse complex documents, like PDFs with charts, tables, and more. \n",
+    "\n",
+    "2. Default PDF Parser (In `SimpleDirectoryReader`). If you don't want to signup for an account / use a PDF service, just use the default PyPDF reader bundled in our file loader. It's a good choice for getting started!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b648635a-2672-407f-bae6-01660e5426d7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Uncomment this code if you want to use LlamaParse\n",
+    "# from llama_parse import LlamaParse\n",
+    "\n",
+    "# docs_kendrick = LlamaParse(result_type=\"text\").load_data(\"./data/kendrick.pdf\")\n",
+    "# docs_drake = LlamaParse(result_type=\"text\").load_data(\"./data/drake.pdf\")\n",
+    "# docs_both = LlamaParse(result_type=\"text\").load_data(\n",
+    "#     \"./data/drake_kendrick_beef.pdf\"\n",
+    "# )\n",
+    "\n",
+    "# Uncomment this code if you want to use SimpleDirectoryReader / default PDF Parser\n",
+    "# from llama_index.core import SimpleDirectoryReader\n",
+    "\n",
+    "# docs_kendrick = SimpleDirectoryReader(input_files=[\"data/kendrick.pdf\"]).load_data()\n",
+    "# docs_drake = SimpleDirectoryReader(input_files=[\"data/drake.pdf\"]).load_data()\n",
+    "# docs_both = SimpleDirectoryReader(input_files=[\"data/drake_kendrick_beef.pdf\"]).load_data()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "071a8f44-2765-4d57-b8da-15d3c718874d",
+   "metadata": {},
+   "source": [
+    "## 1. Basic Completion and Chat"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c0b1ace8-32fb-46b2-a065-8817ddc0310b",
+   "metadata": {},
+   "source": [
+    "### Call complete with a prompt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2db43f9-74af-453c-9f83-8db0379c3302",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = llm.complete(\"do you like drake or kendrick better?\")\n",
+    "\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89326153-e2d2-4136-8193-fb27d20670c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stream_response = llm.stream_complete(\n",
+    "    \"you're a drake fan. tell me why you like drake more than kendrick\"\n",
+    ")\n",
+    "\n",
+    "for t in stream_response:\n",
+    "    print(t.delta, end=\"\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a4558339-c8a1-4d26-a430-eb71768b5351",
+   "metadata": {},
+   "source": [
+    "### Call chat with a list of messages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f393031-f743-4a28-a122-71817e3fbd1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.llms import ChatMessage\n",
+    "\n",
+    "messages = [\n",
+    "    ChatMessage(role=\"system\", content=\"You are Kendrick.\"),\n",
+    "    ChatMessage(role=\"user\", content=\"Write a verse.\"),\n",
+    "]\n",
+    "response = llm.chat(messages)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8e9551fc-0efc-4671-bc57-339121004c39",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6a67a33d-fe7d-4381-983f-ca3a6945995d",
+   "metadata": {},
+   "source": [
+    "## 2. Basic RAG (Vector Search, Summarization)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c104a0c5-e43b-475b-9fa6-186906c1f327",
+   "metadata": {},
+   "source": [
+    "### Basic RAG (Vector Search)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "216787b7-e40a-43fc-a4ca-c43cb798ce9e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import VectorStoreIndex\n",
+    "\n",
+    "index = VectorStoreIndex.from_documents(docs_both)\n",
+    "query_engine = index.as_query_engine(similarity_top_k=3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a854e9d3-70f1-4927-a2f6-59e90c31f2f0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = query_engine.query(\"Tell me about family matters\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "da796970-bc38-4cb4-9d32-ebd1b71d4bdc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(str(response))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eff935b7-4f37-4758-8997-82fb0852e732",
+   "metadata": {},
+   "source": [
+    "### Basic RAG (Summarization)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dfe72300-7a38-453e-b1f2-bc1c00a01ff7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import SummaryIndex\n",
+    "\n",
+    "summary_index = SummaryIndex.from_documents(docs_both)\n",
+    "summary_engine = summary_index.as_query_engine()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "178f1f12-51f7-4b45-9346-c16ed12b3b8d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = summary_engine.query(\n",
+    "    \"Given your assessment of this article, who won the beef?\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b8125382-d576-4b99-a0da-2fbb71a5b19b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(str(response))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "68918eb6-f1e6-460c-b1d5-fb49c3fed4b8",
+   "metadata": {},
+   "source": [
+    "## 3. Advanced RAG (Routing)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "94fd7097-0287-4522-8e43-3e088291fa8a",
+   "metadata": {},
+   "source": [
+    "### Build a Router that can choose whether to do vector search or summarization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3949dd41-e9a1-47f6-900f-4f987cad3f84",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.tools import QueryEngineTool, ToolMetadata\n",
+    "\n",
+    "vector_tool = QueryEngineTool(\n",
+    "    index.as_query_engine(),\n",
+    "    metadata=ToolMetadata(\n",
+    "        name=\"vector_search\",\n",
+    "        description=\"Useful for searching for specific facts.\",\n",
+    "    ),\n",
+    ")\n",
+    "\n",
+    "summary_tool = QueryEngineTool(\n",
+    "    index.as_query_engine(response_mode=\"tree_summarize\"),\n",
+    "    metadata=ToolMetadata(\n",
+    "        name=\"summary\",\n",
+    "        description=\"Useful for summarizing an entire document.\",\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d063d07b-c03e-4b26-8556-e3c058d2fd52",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.query_engine import RouterQueryEngine\n",
+    "\n",
+    "query_engine = RouterQueryEngine.from_defaults(\n",
+    "    [vector_tool, summary_tool], select_multi=False, verbose=True, llm=llm_70b\n",
+    ")\n",
+    "\n",
+    "response = query_engine.query(\n",
+    "    \"Tell me about the song meet the grahams - why is it significant\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "396aad75-5a71-4bd9-a760-7f13fe223079",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a795f0bc-e871-4580-8983-6fb27d421fc5",
+   "metadata": {},
+   "source": [
+    "## 4. Text-to-SQL \n",
+    "\n",
+    "Here, we download and use a sample SQLite database with 11 tables, with various info about music, playlists, and customers. We will limit to a select few tables for this test."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a5096501-92c3-41af-a871-ade869d710fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!wget \"https://www.sqlitetutorial.net/wp-content/uploads/2018/03/chinook.zip\" -O \"./data/chinook.zip\"\n",
+    "!unzip \"./data/chinook.zip\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d4db989e-c18d-4416-928e-7be4ead4d869",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sqlalchemy import (\n",
+    "    create_engine,\n",
+    "    MetaData,\n",
+    "    Table,\n",
+    "    Column,\n",
+    "    String,\n",
+    "    Integer,\n",
+    "    select,\n",
+    "    column,\n",
+    ")\n",
+    "\n",
+    "engine = create_engine(\"sqlite:///chinook.db\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bf6ed233-0ea3-4d4f-8c33-5b6d558b89b9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import SQLDatabase\n",
+    "\n",
+    "sql_database = SQLDatabase(engine)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "debae423-1004-40f6-9356-e1c3add4d965",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.indices.struct_store import NLSQLTableQueryEngine\n",
+    "\n",
+    "query_engine = NLSQLTableQueryEngine(\n",
+    "    sql_database=sql_database,\n",
+    "    tables=[\"albums\", \"tracks\", \"artists\"],\n",
+    "    llm=llm_70b,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a65ecd70-09c4-4872-b712-3a8235d03db2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = query_engine.query(\"What are some albums?\")\n",
+    "\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c12b93ef-d6d1-4d15-9cb2-343070f72851",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = query_engine.query(\"What are some artists? Limit it to 5.\")\n",
+    "\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c243d38-c6ac-445c-b9d4-53a9ae013b7b",
+   "metadata": {},
+   "source": [
+    "This last query should be a more complex join"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "553741c2-1050-445d-979a-ae2150ee3248",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = query_engine.query(\n",
+    "    \"What are some tracks from the artist AC/DC? Limit it to 3\"\n",
+    ")\n",
+    "\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "300689d7-9e67-4404-9898-27404ee6d4b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(response.metadata[\"sql_query\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1419fe67-aa6a-47db-88cd-9bb251c15615",
+   "metadata": {},
+   "source": [
+    "## 5. Structured Data Extraction\n",
+    "\n",
+    "An important use case for function calling is extracting structured objects. LlamaIndex provides an intuitive interface for this through `structured_predict` - simply define the target Pydantic class (can be nested), and given a prompt, we extract out the desired object.\n",
+    "\n",
+    "**NOTE**: Since there's no native function calling support with Llama3, the structured extraction is performed by prompting the LLM + output parsing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4432f35a-5f29-45e9-a928-32e6d77b158e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.llms.groq import Groq\n",
+    "from llama_index.core.prompts import PromptTemplate\n",
+    "from pydantic import BaseModel\n",
+    "\n",
+    "\n",
+    "class Restaurant(BaseModel):\n",
+    "    \"\"\"A restaurant with name, city, and cuisine.\"\"\"\n",
+    "\n",
+    "    name: str\n",
+    "    city: str\n",
+    "    cuisine: str\n",
+    "\n",
+    "\n",
+    "llm = Groq(model=\"llama3-8b-8192\", pydantic_program_mode=\"llm\")\n",
+    "prompt_tmpl = PromptTemplate(\n",
+    "    \"Generate a restaurant in a given city {city_name}\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c451f52-a051-4ba2-a683-0c1fd258d986",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "restaurant_obj = llm.structured_predict(\n",
+    "    Restaurant, prompt_tmpl, city_name=\"Miami\"\n",
+    ")\n",
+    "print(restaurant_obj)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "839018a9-b65f-4824-83f7-2e4e52b55c5d",
+   "metadata": {},
+   "source": [
+    "## 6. Adding Chat History to RAG (Chat Engine)\n",
+    "\n",
+    "In this section we create a stateful chatbot from a RAG pipeline, with our chat engine abstraction.\n",
+    "\n",
+    "Unlike a stateless query engine, the chat engine maintains conversation history (through a memory module like buffer memory). It performs retrieval given a condensed question, and feeds the condensed question + context + chat history into the final LLM prompt.\n",
+    "\n",
+    "Related resource: https://docs.llamaindex.ai/en/stable/examples/chat_engine/chat_engine_condense_plus_context/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27e56315-9513-4b32-bf9a-ce97c3ab52df",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.memory import ChatMemoryBuffer\n",
+    "from llama_index.core.chat_engine import CondensePlusContextChatEngine\n",
+    "\n",
+    "memory = ChatMemoryBuffer.from_defaults(token_limit=3900)\n",
+    "\n",
+    "chat_engine = CondensePlusContextChatEngine.from_defaults(\n",
+    "    index.as_retriever(),\n",
+    "    memory=memory,\n",
+    "    llm=llm,\n",
+    "    context_prompt=(\n",
+    "        \"You are a chatbot, able to have normal interactions, as well as talk\"\n",
+    "        \" about the Kendrick and Drake beef.\"\n",
+    "        \"Here are the relevant documents for the context:\\n\"\n",
+    "        \"{context_str}\"\n",
+    "        \"\\nInstruction: Use the previous chat history, or the context above, to interact and help the user.\"\n",
+    "    ),\n",
+    "    verbose=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b24524d2-fdce-4237-8ecc-67f139302303",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = chat_engine.chat(\n",
+    "    \"Tell me about the songs Drake released in the beef.\"\n",
+    ")\n",
+    "print(str(response))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f9a87a16-2864-4c48-95e7-a2103e119242",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = chat_engine.chat(\"What about Kendrick?\")\n",
+    "print(str(response))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a7fa07ed-58f0-445e-bbd3-4ad8bac6598e",
+   "metadata": {},
+   "source": [
+    "## 7. Agents\n",
+    "\n",
+    "Here we build agents with Llama 3. We perform RAG over simple functions as well as the documents above."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aa98d735-5d43-413f-aab3-fc3adeed81b1",
+   "metadata": {},
+   "source": [
+    "### Agents And Tools"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb73a01f-8a2e-4dd6-91f8-710c92b81c56",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from typing import Sequence, List\n",
+    "\n",
+    "from llama_index.core.llms import ChatMessage\n",
+    "from llama_index.core.tools import BaseTool, FunctionTool\n",
+    "from llama_index.core.agent import ReActAgent\n",
+    "\n",
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "efbee832-9786-4551-93f2-01ee90fa0f4d",
+   "metadata": {},
+   "source": [
+    "### Define Tools"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b2058b36-8053-4dc8-9218-c286702ecf66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def multiply(a: int, b: int) -> int:\n",
+    "    \"\"\"Multiple two integers and returns the result integer\"\"\"\n",
+    "    return a * b\n",
+    "\n",
+    "\n",
+    "def add(a: int, b: int) -> int:\n",
+    "    \"\"\"Add two integers and returns the result integer\"\"\"\n",
+    "    return a + b\n",
+    "\n",
+    "\n",
+    "def subtract(a: int, b: int) -> int:\n",
+    "    \"\"\"Subtract two integers and returns the result integer\"\"\"\n",
+    "    return a - b\n",
+    "\n",
+    "\n",
+    "def divide(a: int, b: int) -> int:\n",
+    "    \"\"\"Divides two integers and returns the result integer\"\"\"\n",
+    "    return a / b\n",
+    "\n",
+    "\n",
+    "multiply_tool = FunctionTool.from_defaults(fn=multiply)\n",
+    "add_tool = FunctionTool.from_defaults(fn=add)\n",
+    "subtract_tool = FunctionTool.from_defaults(fn=subtract)\n",
+    "divide_tool = FunctionTool.from_defaults(fn=divide)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "22d7d4dc-e2ce-402c-9350-0e7010d0080c",
+   "metadata": {},
+   "source": [
+    "### ReAct Agent"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72a48053-e30d-4884-bcac-80752047d940",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agent = ReActAgent.from_tools(\n",
+    "    [multiply_tool, add_tool, subtract_tool, divide_tool],\n",
+    "    llm=llm_70b,\n",
+    "    verbose=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7ada828a-3b05-4fc1-90e8-986c5607ae61",
+   "metadata": {},
+   "source": [
+    "### Querying"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9c0b1e56-d9f7-4615-a15a-c91fea1adb00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = agent.chat(\"What is (121 + 2) * 5?\")\n",
+    "print(str(response))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "67ce45f6-bdd4-42aa-8f74-43a50f14094e",
+   "metadata": {},
+   "source": [
+    "### ReAct Agent With RAG QueryEngine Tools"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "97fce5f1-eacf-4ecc-9e83-072e74d3a2a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import (\n",
+    "    SimpleDirectoryReader,\n",
+    "    VectorStoreIndex,\n",
+    "    StorageContext,\n",
+    "    load_index_from_storage,\n",
+    ")\n",
+    "\n",
+    "from llama_index.core.tools import QueryEngineTool, ToolMetadata"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "23963d00-e3d2-4ce1-9ac3-aa486bf4b1a5",
+   "metadata": {},
+   "source": [
+    "### Create ReAct Agent using RAG QueryEngine Tools"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1844dbbd-477c-4c4d-bb18-2c2e16a75a50",
+   "metadata": {},
+   "source": [
+    "This may take 4 minutes to run:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "66ab1e60-3374-4eb9-b7dc-c28db3b47c51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "drake_index = VectorStoreIndex.from_documents(docs_drake)\n",
+    "drake_query_engine = drake_index.as_query_engine(similarity_top_k=3)\n",
+    "\n",
+    "kendrick_index = VectorStoreIndex.from_documents(docs_kendrick)\n",
+    "kendrick_query_engine = kendrick_index.as_query_engine(similarity_top_k=3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e241fe9-f390-4be5-b3c4-da4f56db01ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "drake_tool = QueryEngineTool(\n",
+    "    drake_index.as_query_engine(),\n",
+    "    metadata=ToolMetadata(\n",
+    "        name=\"drake_search\",\n",
+    "        description=\"Useful for searching over Drake's life.\",\n",
+    "    ),\n",
+    ")\n",
+    "\n",
+    "kendrick_tool = QueryEngineTool(\n",
+    "    kendrick_index.as_query_engine(),\n",
+    "    metadata=ToolMetadata(\n",
+    "        name=\"kendrick_search\",\n",
+    "        description=\"Useful for searching over Kendrick's life.\",\n",
+    "    ),\n",
+    ")\n",
+    "\n",
+    "query_engine_tools = [drake_tool, kendrick_tool]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b922feac-b221-4737-92c6-e63eeab4eab7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agent = ReActAgent.from_tools(\n",
+    "    query_engine_tools,\n",
+    "    llm=llm_70b,\n",
+    "    verbose=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7e38edc8-47f8-4f1a-ad87-bc3a9e31a65e",
+   "metadata": {},
+   "source": [
+    "### Querying"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "035c2c8b-5a5e-4df0-a423-4c2d6054f457",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = agent.chat(\"Tell me about how Kendrick and Drake grew up\")\n",
+    "print(str(response))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 1 - 1
recipes/quickstart/Getting_to_know_Llama.ipynb

@@ -831,7 +831,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**Summary: Llama 2 often needs encourgement for step by step thinking to correctly reasoning. Llama 3 understands, reasons and explains better, making chain of thought unnecessary in the cases above.**"
+    "**Summary: Llama 2 often needs encouragement for step by step thinking to correctly reasoning. Llama 3 understands, reasons and explains better, making chain of thought unnecessary in the cases above.**"
    ]
   },
   {

File diffs are limited because there are too many
+ 1 - 1
recipes/use_cases/LiveData.ipynb


File diffs are limited because there are too many
+ 698 - 0
recipes/use_cases/MediaGen.ipynb


+ 7 - 1
recipes/use_cases/README.md

@@ -14,4 +14,10 @@ This step-by-step tutorial shows how to use the [WhatsApp Business API](https://
 This step-by-step tutorial shows how to use the [Messenger Platform](https://developers.facebook.com/docs/messenger-platform/overview) to build a Llama 3 enabled Messenger chatbot.
 
 ### RAG Chatbot Example (running [locally](./chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) or on [OctoAI](../llama_api_providers/OctoAI_API_examples/RAG_Chatbot_example/RAG_Chatbot_Example.ipynb))
-A complete example of how to build a Llama 3 chatbot hosted on your browser that can answer questions based on your own data using retrieval augmented generation (RAG). You can run Llama2 locally if you have a good enough GPU or on OctoAI if you follow the note [here](../README.md#octoai_note).
+A complete example of how to build a Llama 3 chatbot hosted in your browser that can answer questions based on your own data using retrieval augmented generation (RAG). You can run Llama 3 locally if you have a good enough GPU, or on OctoAI if you follow the note [here](../README.md#octoai_note).
+
+### [Sales Bot](./chatbots/sales_bot/SalesBot.ipynb): Sales Bot with Llama3 - A Summarization and RAG Use Case
+A summarization + RAG use case built around the Amazon product reviews Kaggle dataset to build a helpful Music Store Sales Bot. The summarization and RAG are built on top of Llama models hosted on OctoAI, and the vector database is hosted on Weaviate Cloud Services.
+
+### [Media Generation](./MediaGen.ipynb): Building a Video Generation Pipeline with Llama3
+This step-by-step tutorial shows how to leverage Llama 3 to drive the generation of animated videos using SDXL and SVD. More specifically, it relies on JSON formatting to produce a scene-by-scene storyboard of a recipe video. The user provides the name of a dish, then Llama 3 writes a step-by-step guide for preparing that dish, and that guide is brought to life with models like SDXL and SVD.

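To illustrate the JSON-formatted storyboard step described in the Media Generation entry above (MediaGen.ipynb's diff is not rendered in this view), a minimal sketch could look like the following. The endpoint URL is the OctoAI chat endpoint that appears in older cells of this PR; the model name, prompt wording, and JSON schema are illustrative assumptions, not the notebook's actual code.

```python
import json
import os

from openai import OpenAI

# OctoAI exposes an OpenAI-compatible chat endpoint (URL taken from older cells in this PR)
client = OpenAI(
    base_url="https://text.octoai.run/v1",
    api_key=os.environ["OCTOAI_API_TOKEN"],
)

dish = "spaghetti carbonara"
response = client.chat.completions.create(
    model="meta-llama-3-8b-instruct",
    messages=[
        {"role": "system", "content": "You reply with valid JSON only."},
        {
            "role": "user",
            "content": (
                f"Write a step-by-step recipe storyboard for {dish}. "
                'Return JSON of the form {"scenes": [{"step": 1, "caption": "...", "image_prompt": "..."}]} '
                "with one scene per step, and nothing else."
            ),
        },
    ],
    temperature=0.1,
    max_tokens=1024,
)

# Without constrained sampling the model may wrap the JSON in prose,
# so a real pipeline would need a more defensive parser or a retry loop.
storyboard = json.loads(response.choices[0].message.content)
for scene in storyboard["scenes"]:
    print(scene["step"], scene["caption"])

# Each scene's image_prompt would then be fed to SDXL (text-to-image) and
# SVD (image-to-video) to produce the animated clips described above.
```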
+ 34 - 25
recipes/use_cases/agents/langchain/README.md

@@ -1,61 +1,70 @@
 # LangChain <> Llama3 Cookbooks
 
-LLM agents use [planning, memory, and tools](https://lilianweng.github.io/posts/2023-06-23-agent/) to accomplish tasks.
+LLM agents use [planning, memory, and tools](https://lilianweng.github.io/posts/2023-06-23-agent/) to accomplish tasks. Agents can empower Llama 3 with important new capabilities. Here, we will show how to give Llama 3 the ability to perform web search, as well as multi-modality: image generation (text-to-image), image analysis (image-to-text), and voice (text-to-speech) tools!
 
-LangChain offers several different ways to implement agents.
+LangChain offers several different ways to implement agents with Llama 3:
 
-(1) Use [AgentExecutor](https://python.langchain.com/docs/modules/agents/quick_start/) with [tool-calling](https://python.langchain.com/docs/integrations/chat/) versions of Llama 3.
+(1) `ReAct agent` - Uses [AgentExecutor](https://python.langchain.com/docs/modules/agents/quick_start/) with [tool-calling](https://python.langchain.com/docs/integrations/chat/) versions of Llama 3.
 
-(2) Use [LangGraph](https://python.langchain.com/docs/langgraph), a library from LangChain that can be used to build reliable agents with Llama 3.
+(2) `LangGraph tool calling agent` - Uses [LangGraph](https://python.langchain.com/docs/langgraph) with [tool-calling](https://python.langchain.com/docs/integrations/chat/) versions of Llama 3.
+
+(3) `LangGraph custom agent` - Uses [LangGraph](https://python.langchain.com/docs/langgraph) with **any** version of Llama 3 (so long as it supports structured output).
+
+As we move from option (1) to (3) the degree of customization and flexibility increases:
+
+(1) `ReAct agent` using AgentExecutor is great for getting started quickly with minimal code, but it requires a version of Llama 3 with reliable tool-calling, is the least customizable, and uses the higher-level AgentExecutor abstraction.
+  
+(2) `LangGraph tool calling agent` is more customizable than (1) because the LLM assistant (planning) and tool call (action) nodes are defined by the user, but it still requires a version of Llama 3 with reliable tool-calling.
+  
+(3) `LangGraph custom agent` does not require a version of Llama 3 with reliable tool-calling and is the most customizable, but requires the most work to implement. 
+
+![langgraph_agent_architectures](https://github.com/rlancemartin/llama-recipes/assets/122662504/5ed2bef0-ae11-4efa-9e88-ab560a4d0022)
 
 ---
 
-### AgentExecutor Agent
+### `ReAct agent`
 
-AgentExecutor is the runtime for an agent. AgentExecutor calls the agent, executes the actions it chooses, passes the action outputs back to the agent, and repeats.
+The AgentExecutor manages the loop of planning, executing tool calls, and processing outputs until an AgentFinish signal is generated, indicating task completion.
 
 Our first notebook, `tool-calling-agent`, shows how to build a [tool calling agent](https://python.langchain.com/docs/modules/agents/agent_types/tool_calling/) with AgentExecutor and Llama 3.
 
-This shows how to build an agent that uses web search and retrieval tools.
-
 --- 
 
-### LangGraph Agent
+### `LangGraph tool calling agent`
 
 [LangGraph](https://python.langchain.com/docs/langgraph) is a library from LangChain that can be used to build reliable agents.
 
-LangGraph can be used to build agents with a few pieces:
-- **Planning:** Define a control flow of steps that you want the agent to take (a graph)
-- **Memory:** Persist information (graph state) across these steps
-- **Tool use:** Modify state at any step
+Our second notebook, `langgraph-tool-calling-agent`, shows an alternative to AgentExecutor for building a Llama 3 powered agent. 
+
+--- 
 
-Our second notebook, `langgraph-agent`, shows how to build a Llama 3 powered agent that uses web search and retrieval tool in LangGraph.
+### `LangGraph custom agent`
 
-It discusses some of the trade-offs between AgentExecutor and LangGraph.
+Our third notebook, `langgraph-custom-agent`, shows how to build a Llama 3 powered agent without reliance on tool-calling. 
 
 --- 
 
-### LangGraph RAG Agent
+### `LangGraph RAG Agent`
 
-Our third notebook, `langgraph-rag-agent`, shows how to apply LangGraph to build advanced Llama 3 powered RAG agents that use ideas from 3 papers:
+Our fourth notebook, `langgraph-rag-agent`, shows how to apply LangGraph to build a custom Llama 3 powered RAG agent that uses ideas from 3 papers:
 
 * Corrective-RAG (CRAG) [paper](https://arxiv.org/pdf/2401.15884.pdf) uses self-grading on retrieved documents and web-search fallback if documents are not relevant.
 * Self-RAG [paper](https://arxiv.org/abs/2310.11511) adds self-grading on generations for hallucinations and for ability to answer the question.
 * Adaptive RAG [paper](https://arxiv.org/abs/2403.14403) routes queries between different RAG approaches based on their complexity.
 
 We implement each approach as a control flow in LangGraph:
-- **Planning:** The sequence of RAG steps (e.g., retrieval, grading, and generation) that we want the agent to take
-- **Memory:** All the RAG-related information (input question, retrieved documents, etc) that we want to pass between steps
-- **Tool use:** All the tools needed for RAG (e.g., decide web search or vectorstore retrieval based on the question)
+- **Planning:** The sequence of RAG steps (e.g., retrieval, grading, and generation) that we want the agent to take.
+- **Memory:** All the RAG-related information (input question, retrieved documents, etc) that we want to pass between steps.
+- **Tool use:** All the tools needed for RAG (e.g., decide web search or vectorstore retrieval based on the question).
 
 We will build from CRAG (blue, below) to Self-RAG (green) and finally to Adaptive RAG (red):
 
-![Screenshot 2024-05-03 at 10 50 02 AM](https://github.com/rlancemartin/llama-recipes/assets/122662504/ec4aa1cd-3c7e-4cd1-a1e7-7deddc4033a8)
+![langgraph_rag_agent_](https://github.com/rlancemartin/llama-recipes/assets/122662504/ec4aa1cd-3c7e-4cd1-a1e7-7deddc4033a8)
 
 --- 
+ 
+### `Local LangGraph RAG Agent`
 
-### Local LangGraph RAG Agent
-
-Our fourth notebook, `langgraph-rag-agent-local`, shows how to apply LangGraph to build advanced RAG agents using Llama 3 that run locally and reliably.
+Our fifth notebook, `langgraph-rag-agent-local`, shows how to apply LangGraph to build advanced RAG agents using Llama 3 that run locally and reliably.
 
-See this [video overview](https://www.youtube.com/watch?v=sgnrL7yo1TE) for more detail.
+See this [video overview](https://www.youtube.com/watch?v=sgnrL7yo1TE) for more detail on the design of this agent.

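As a rough illustration of option (1) from the README above, a minimal tool-calling agent built on AgentExecutor might look like the sketch below. It assumes a tool-calling-capable Llama 3 served through Groq (`pip install langchain langchain-groq`, with `GROQ_API_KEY` set) and uses a toy tool in place of the web-search and multi-modal tools wired up in the notebooks.

```python
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.tools import tool
from langchain_groq import ChatGroq  # any provider with reliable Llama 3 tool-calling works


@tool
def magic_function(x: int) -> int:
    """Apply a magic function to an integer."""
    return x + 2


llm = ChatGroq(model="llama3-70b-8192", temperature=0)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant. Use the tools when they help."),
        ("human", "{input}"),
        MessagesPlaceholder("agent_scratchpad"),
    ]
)

# AgentExecutor loops: plan a tool call, execute it, feed the result back,
# and stop once the model returns a final answer (AgentFinish).
tools = [magic_function]
agent = create_tool_calling_agent(llm, tools, prompt)
executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

print(executor.invoke({"input": "What is magic_function(3)?"}))
```

The LangGraph notebooks (options 2 and 3) replace AgentExecutor's built-in loop with user-defined assistant and tool nodes over a shared graph state, which is what makes them more customizable.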
File diffs are limited because there are too many
+ 0 - 698
recipes/use_cases/agents/langchain/langgraph-agent.ipynb


File diffs are limited because there are too many
+ 931 - 0
recipes/use_cases/agents/langchain/langgraph-custom-agent.ipynb


File diffs are limited because there are too many
+ 831 - 0
recipes/use_cases/agents/langchain/langgraph-tool-calling-agent.ipynb


File diffs are limited because there are too many
+ 626 - 82
recipes/use_cases/agents/langchain/tool-calling-agent.ipynb


File diffs are limited because there are too many
+ 10262 - 0
recipes/use_cases/chatbots/sales_bot/Musical_instruments_reviews.csv


+ 668 - 0
recipes/use_cases/chatbots/sales_bot/SalesBot.ipynb

@@ -0,0 +1,668 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "374b67d0-b446-4d6f-8e07-59e97716c55a",
+   "metadata": {},
+   "source": [
+    "# Sales Bot with Llama3 - A Summarization and RAG Use Case"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "add4953d-07c3-4480-ad91-7d0ea9c9fb55",
+   "metadata": {},
+   "source": [
+    "## Overview\n",
+    "\n",
+    "In this notebook you'll take an Amazon product reviews dataset from Kaggle and use Llama3 to obtain product review summaries, upsert those summaries in a vector database, then use Retrieval Augmented Generation (RAG) to power a sales chatbot that can make targeted product recommendations.\n",
+    "\n",
+    "Let's take a look at the overall workflow:\n",
+    "1. We start with a dataset that contains over 10,000 reviews across 900 Amazon musical instruments and accessories.\n",
+    "2. Using Llama2 70B chat (hosted on OctoAI), we generate summaries of product reviews for each product from the 20 most recent reviews. We format the summaries in JSON format.\n",
+    "3. We then take the summaries and upsert them into a vector database (Weaviate in this case)\n",
+    "4. We then use this vector database and Llama3 70B instruct (hosted on OctoAI) to build a RAG-based sales chatbot that provides targeted recommendations to the user based on the products that are present in the inventory.\n",
+    "\n",
+    "Note: at the time of writing this tutorial, JSON mode formatting isn't supported for Llama 3 on OctoAI via constrained sampling which is why we are falling back onto Llama 2. This tutorial will be updated when the feature becomes available to rely on Llama 3 exclusively.\n",
+    "\n",
+    "### OctoAI\n",
+    "We'll use [OctoAI](https://octo.ai/) to power all of the GenAI model needs of this notebook: LLMs, image gen, image animation.\n",
+    "* To use OctoAI, you'll need to go to https://octoai.cloud/ and sign in using your Google or GitHub account.\n",
+    "* Next you'll need to generate an OctoAI API token by following these [instructions](https://octo.ai/docs/getting-started/how-to-create-an-octoai-access-token). Keep the API token in hand, we'll need it further down in this notebook.\n",
+    "\n",
+    "In this example we will use the Llama 3 70b instruct model. You can find more on Llama models on the [OctoAI text generation solution page](https://octoai.cloud/text).\n",
+    "\n",
+    "At the time of writing this notebook the following Llama models are available on OctoAI:\n",
+    "* meta-llama-3-8b-instruct\n",
+    "* meta-llama-3-70b-instruct\n",
+    "* codellama-7b-instruct\n",
+    "* codellama-13b-instruct\n",
+    "* codellama-34b-instruct\n",
+    "* llama-2-13b-chat\n",
+    "* llama-2-70b-chat\n",
+    "* llamaguard-7b\n",
+    "\n",
+    "### Weaviate\n",
+    "We'll use Weaviate Cloud Services (WCS) for our vector database. You can create an account and Weaviate clusters easily at the following link: https://console.weaviate.cloud/.\n",
+    "You can then create a cluster, from which you can obtain the REST Endpoint URL and the API key to use the cluster endpoint.\n",
+    "\n",
+    "### OpenAI\n",
+    "We'll be using OpenAI for its embedding model to upsert our vectors into the Weaviate vector database. Create an account and obtain an API key here: https://openai.com/api/\n",
+    "\n",
+    "### Local Python Notebook\n",
+    "We highly recommend launching this notebook from a fresh python environment, for instance you can run the following:\n",
+    "```\n",
+    "python3 -m venv .venv         \n",
+    "source .venv/bin/activate\n",
+    "```\n",
+    "All you need to run this notebook is to install jupyter notebook with `python3 -m pip install notebook` then run `jupyter notebook` ([link](https://jupyter.org/install)) in the same directory as this `.ipynb` file.\n",
+    "You don't need to install additional pip packages ahead of running the notebook, since those will be installed right at the beginning. You will need to ensure your system has `imagemagick` installed by following the [instructions](https://imagemagick.org/script/download.php)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "133c2ea4-0256-49cf-9f5a-a9e5bb0bb63f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's start by installing the appropriate python packages\n",
+    "! pip install octoai===1.0.2 openai weaviate-client pandas gradio pydantic"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "75341227-43f8-4a68-b3cb-31e8216f874e",
+   "metadata": {},
+   "source": [
+    "## Part 1: Review Summarization"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "793c06d7-fa67-4c67-a380-081ed3a7a7bf",
+   "metadata": {},
+   "source": [
+    "Let's start by importing all of the packages we need for this example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "edd366c8-4f0b-4211-83d3-c16e88cbd5c8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gradio\n",
+    "import json\n",
+    "import langchain\n",
+    "import os\n",
+    "import openai\n",
+    "import weaviate\n",
+    "from getpass import getpass\n",
+    "from json import loads\n",
+    "from pandas import DataFrame, concat, read_csv\n",
+    "from pydantic import BaseModel, Field\n",
+    "from typing import List\n",
+    "import weaviate.classes as wvc"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cd171a7c-c5e7-46d5-8a04-a0f7863609be",
+   "metadata": {},
+   "source": [
+    "Enter your OctoAI, Weaviate, and OpenAI tokens below"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3af09686-a654-45b0-98c5-dee6f30440c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get OctoAI API token for Llama 2 & 3\n",
+    "OCTOAI_API_TOKEN = getpass()\n",
+    "os.environ[\"OCTOAI_API_TOKEN\"] = OCTOAI_API_TOKEN"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "31c3e684-6e5e-41ad-81d4-970b06522553",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get WCS API key\n",
+    "WCS_API_KEY = getpass()\n",
+    "os.environ[\"WCS_API_KEY\"] = WCS_API_KEY"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a44f7b71-c4f9-4fd6-9a3b-1322c2fd0c35",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get WCS URL\n",
+    "WCS_URL = getpass()\n",
+    "os.environ[\"WCS_URL\"] = WCS_URL"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e4502dfa-c369-4085-a697-fdcda00f970b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get OpenAI API key for the embedding model\n",
+    "OPENAI_API_KEY = getpass()\n",
+    "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "883986ad-9f60-44d8-ab64-3f566261e055",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# First let's load the dataset from Kaggle: https://www.kaggle.com/datasets/eswarchandt/amazon-music-reviews\n",
+    "df = read_csv('Musical_instruments_reviews.csv')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c05865a7-307a-425e-a6ee-f057d63db77b",
+   "metadata": {},
+   "source": [
+    "Set `product_record_limit` to a lower number if you just want to do a test run"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22f024e7-3976-425f-b684-8b2c2c1ed191",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set a product record limit\n",
+    "product_record_limit = 900"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "06554f51-5983-42fc-8a8e-684ae82099db",
+   "metadata": {
+    "scrolled": true
+   },
+   "source": [
+    "# List all of the unique ASIN:\n",
+    "asin_list = df.asin.unique()\n",
+    "print(\"There are {} unique products in the music product inventory\".format(len(asin_list)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4941baa1-107b-4f39-8d04-1daa5acd465b",
+   "metadata": {},
+   "source": [
+    "For each one of the unique products, let's group the reviews together and sort them by how recent they are"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "38147b91-2425-46a7-b6c0-221173d81024",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get the reviews for the product ASIN, sorted by recency and store in dict\n",
+    "review_dict = {}\n",
+    "for asin in asin_list[0:product_record_limit]:\n",
+    "    reviews = df.loc[df['asin'] == asin]\\\n",
+    "                .sort_values([\"unixReviewTime\"], axis=0, ascending=False)\\\n",
+    "                .reviewText.tolist()\n",
+    "    review_dict[asin] = reviews"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7d5fb78d-808a-4753-abba-4a3066d76ba7",
+   "metadata": {},
+   "source": [
+    "To be able to store our summaries into our vector DB, we need to have the fields formatted into a JSON object. We use Pydantic base class model here to define our formatting."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b786cde1-116a-47eb-8478-3fa2285dcf9d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the Pydantic model that specifies how our output should be formatted\n",
+    "class ProductRecord(BaseModel):\n",
+    "    \"\"\"The record of a given product\"\"\"\n",
+    "    description: str = Field(description=\"Description of the product\")\n",
+    "    name: str = Field(description=\"Name of the product\")\n",
+    "    review_summary: str = Field(description=\"Summary of all of the reviews\")\n",
+    "    ASIN: str = Field(description=\"ASIN of the product\")\n",
+    "    features: str = Field(description=\"Features of the product based on the reviews\")"
+   ]
+  },
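+  {
+   "cell_type": "markdown",
+   "id": "c3a91b72-1a2b-4cde-9f10-000000000001",
+   "metadata": {},
+   "source": [
+    "As a quick sanity check, here's a minimal sketch (using the `ProductRecord` model above) that builds one hand-written record and prints the JSON schema we'll later pass to the LLM as a response format constraint. The sample values below are made up purely for illustration.\n",
+    "```python\n",
+    "# Hypothetical sample record, just to illustrate the expected shape\n",
+    "sample = ProductRecord(\n",
+    "    description=\"A compact 10-hole diatonic harmonica in the key of C.\",\n",
+    "    name=\"Pocket Harmonica\",\n",
+    "    review_summary=\"Reviewers praise the tone and durability for the price.\",\n",
+    "    ASIN=\"B000000000\",\n",
+    "    features=\"Stainless steel covers, plastic comb, includes a carrying case.\"\n",
+    ")\n",
+    "print(sample.model_dump_json(indent=2))\n",
+    "\n",
+    "# This is the schema we pass as the response format constraint further down\n",
+    "print(ProductRecord.model_json_schema())\n",
+    "```"
+   ]
+  },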
+  {
+   "cell_type": "markdown",
+   "id": "08226a6e-f994-454b-9a1d-6246b34bfca2",
+   "metadata": {},
+   "source": [
+    "We define our prompt template below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cc3fe69-bf0c-4a50-8d9c-1ae6cb99a9ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prepare a prompt template\n",
+    "template = '''\n",
+    "Here are product reviews for a music product with an ID of {asin}.\n",
+    " - Respond back only as only JSON!\n",
+    " - Provide:\n",
+    "     - the product \"description\",\n",
+    "     - the product \"name\",\n",
+    "     - a summary of all the reviews as \"review_summary\",\n",
+    "     - the \"ASIN\" and\n",
+    "     - and the product \"features\" based on the content of these reviews. \n",
+    " - The \"features\" should be a string describing the features and NOT JSON. \n",
+    " - Do not include the ASIN in the description field.\n",
+    " \n",
+    "The reviews for the product are: {reviews}\n",
+    "'''"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9b8dc3fa-4ad9-4329-96a0-353b05a1c43e",
+   "metadata": {},
+   "source": [
+    "We initialize the OctoAI client using OpenAI's API. All we have to do is override the `base_url` and `api_key`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57c2ff0a-8029-41a6-a06f-41e560b92230",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Init OctoAI client\n",
+    "client = openai.OpenAI(\n",
+    "    base_url=\"https://text.octoai.run/v1\",\n",
+    "    api_key=os.environ[\"OCTOAI_API_TOKEN\"]\n",
+    ")"
+   ]
+  },
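+  {
+   "cell_type": "markdown",
+   "id": "8f4d2c10-55aa-4b3e-9c21-000000000002",
+   "metadata": {},
+   "source": [
+    "Optionally, you can sanity-check the OctoAI client with one short completion before kicking off the long summarization loop below. This is just a sketch; it assumes the `meta-llama-3-8b-instruct` model listed earlier is available on your account.\n",
+    "```python\n",
+    "# Optional connectivity check against the OctoAI endpoint\n",
+    "test = client.chat.completions.create(\n",
+    "    model=\"meta-llama-3-8b-instruct\",\n",
+    "    messages=[{\"role\": \"user\", \"content\": \"Say hello in one short sentence.\"}],\n",
+    "    max_tokens=32\n",
+    ")\n",
+    "print(test.choices[0].message.content)\n",
+    "```"
+   ]
+  },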
+  {
+   "cell_type": "markdown",
+   "id": "bd0eb425-ceea-4258-a52d-814b7335febb",
+   "metadata": {},
+   "source": [
+    "Iterate over all product ASINs and summarize the top 20 most recent reviews. Note: this takes a while to run unless we parallelize it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1a55839e-a824-4919-b755-730eaac48d83",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Produce the 900 product summaries\n",
+    "review_summaries = []\n",
+    "counter = 0\n",
+    "\n",
+    "# This can take a while to process serially (30min+)\n",
+    "# TODO: Optimize to run in a few parallel threads to run faster while meeting the 240RPM limit\n",
+    "for asin, review_list in review_dict.items():\n",
+    "    print(f'Getting review summary {counter} of {len(review_dict)}, ASIN: {asin}')\n",
+    "    try:\n",
+    "        response = client.chat.completions.create(\n",
+    "            model=\"llama-2-70b-chat\",\n",
+    "            messages=[\n",
+    "                {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+    "                {\"role\": \"user\", \"content\": template.format(\n",
+    "                    asin = asin,\n",
+    "                    reviews = review_list[0:20]\n",
+    "                )},\n",
+    "            ],\n",
+    "            temperature=0,\n",
+    "            response_format={\"type\": \"json_object\", \"schema\": ProductRecord.model_json_schema()},\n",
+    "            max_tokens=1024\n",
+    "        )\n",
+    "        print(\"\\n{}\\n\".format(response.choices[0].message.content))\n",
+    "        summary = loads(response.choices[0].message.content)\n",
+    "        summary[\"ASIN\"] = asin\n",
+    "        review_summaries.append(summary)\n",
+    "    except:\n",
+    "        print(f'Issue with ASIN {asin}, skipping')\n",
+    "        pass\n",
+    "    counter += 1\n",
+    "\n",
+    "review_summaries = DataFrame(review_summaries)\n",
+    "\n",
+    "print(review_summaries)"
+   ]
+  },
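+  {
+   "cell_type": "markdown",
+   "id": "2b6e7d44-91c3-4f0a-8d55-000000000003",
+   "metadata": {},
+   "source": [
+    "If the serial loop above is too slow, one possible way to address the parallelization TODO is a small thread pool combined with a crude submission throttle, so you stay under the 240 RPM limit mentioned above. This is only an untested sketch; `MAX_WORKERS` and the sleep interval are assumptions you may need to tune.\n",
+    "```python\n",
+    "import time\n",
+    "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
+    "\n",
+    "MAX_WORKERS = 4          # assumption: a modest pool size\n",
+    "MIN_INTERVAL = 60 / 240  # seconds between request submissions (240 RPM)\n",
+    "\n",
+    "def summarize_one(asin, review_list):\n",
+    "    # Same request as in the serial loop above\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=\"llama-2-70b-chat\",\n",
+    "        messages=[\n",
+    "            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+    "            {\"role\": \"user\", \"content\": template.format(asin=asin, reviews=review_list[0:20])},\n",
+    "        ],\n",
+    "        temperature=0,\n",
+    "        response_format={\"type\": \"json_object\", \"schema\": ProductRecord.model_json_schema()},\n",
+    "        max_tokens=1024\n",
+    "    )\n",
+    "    summary = loads(response.choices[0].message.content)\n",
+    "    summary[\"ASIN\"] = asin\n",
+    "    return summary\n",
+    "\n",
+    "parallel_summaries = []\n",
+    "with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:\n",
+    "    futures = []\n",
+    "    for asin, review_list in review_dict.items():\n",
+    "        futures.append(pool.submit(summarize_one, asin, review_list))\n",
+    "        time.sleep(MIN_INTERVAL)  # throttle submissions to respect the rate limit\n",
+    "    for future in as_completed(futures):\n",
+    "        try:\n",
+    "            parallel_summaries.append(future.result())\n",
+    "        except Exception as e:\n",
+    "            print(f\"Skipping a product due to: {e}\")\n",
+    "\n",
+    "parallel_summaries = DataFrame(parallel_summaries)\n",
+    "print(parallel_summaries)\n",
+    "```"
+   ]
+  },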
+  {
+   "cell_type": "markdown",
+   "id": "4772d1c1-c9c4-466e-9c80-259804a4286b",
+   "metadata": {},
+   "source": [
+    "# Part 2: Retrieval Augmented Generation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ccd97408-d47f-46f3-b601-f66f8a3b20ff",
+   "metadata": {},
+   "source": [
+    "For our RAG use case we're going to rely on Weaviate vector database and on an OpenAI embedding model. \n",
+    "\n",
+    "When you define your collection, you'll need to provide properties, i.e. object attributes that you want to store in the collection. These properties map 1:1 to the JSON dictionary keys defined earlier for the `ProductRecord` Pydantic base model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5dad98ec-531d-4fc2-aed9-9f337b957feb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Connect to WCS\n",
+    "wcs_client = weaviate.connect_to_wcs(\n",
+    "    cluster_url=os.getenv(\"WCS_URL\"),\n",
+    "    auth_credentials=weaviate.auth.AuthApiKey(os.getenv(\"WCS_API_KEY\")),\n",
+    "    headers={\n",
+    "        \"X-OpenAI-Api-Key\": os.environ[\"OPENAI_API_KEY\"]\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02953f7b-0149-4c13-a7cc-c4dd1da45d43",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create the collection if it doesn't already exist\n",
+    "try:\n",
+    "    collection = wcs_client.collections.get(\"Products\")\n",
+    "except:\n",
+    "    # Create the collection for products\n",
+    "    collection = wcs_client.collections.create(\n",
+    "        name=\"Products\",\n",
+    "        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),\n",
+    "        properties=[\n",
+    "            wvc.config.Property(\n",
+    "                name=\"ASIN\",\n",
+    "                data_type=wvc.config.DataType.TEXT\n",
+    "            ),\n",
+    "             wvc.config.Property(\n",
+    "                name=\"name\",\n",
+    "                data_type=wvc.config.DataType.TEXT\n",
+    "            ),\n",
+    "             wvc.config.Property(\n",
+    "                name=\"review_summary\",\n",
+    "                data_type=wvc.config.DataType.TEXT\n",
+    "            ),\n",
+    "             wvc.config.Property(\n",
+    "                name=\"features\",\n",
+    "                data_type=wvc.config.DataType.TEXT\n",
+    "            ),\n",
+    "             wvc.config.Property(\n",
+    "                name=\"description\",\n",
+    "                data_type=wvc.config.DataType.TEXT\n",
+    "            ),\n",
+    "        ]\n",
+    "    )\n",
+    "    print(\"Collection Created!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1551fd74-b143-4c02-9b56-364d33683fd3",
+   "metadata": {},
+   "source": [
+    "Now we upsert all of the vectors into the databse using OpenAI's embedding model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53f779e7-b875-4a19-9f9c-74b45992608e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert df to JSON string and then to a list of dictionaries\n",
+    "data = review_summaries.to_json(orient='records')\n",
+    "data_list = json.loads(data)\n",
+    "\n",
+    "items_to_insert = []\n",
+    "\n",
+    "for d in data_list:\n",
+    "    new_item = {\n",
+    "        \"ASIN\": d[\"ASIN\"],\n",
+    "        \"name\": d[\"name\"],\n",
+    "        \"description\": d[\"description\"],  \\\n",
+    "        \"features\": d[\"features\"],\n",
+    "        \"review_summary\": d[\"review_summary\"]\n",
+    "    }\n",
+    "    items_to_insert.append(new_item)\n",
+    "\n",
+    "    # Insert every 100 items\n",
+    "    if len(items_to_insert) == 100:\n",
+    "        collection.data.insert_many(items_to_insert)\n",
+    "        items_to_insert.clear()\n",
+    "\n",
+    "# Insert remaining items\n",
+    "if len(items_to_insert) > 0:\n",
+    "    collection.data.insert_many(items_to_insert)"
+   ]
+  },
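+  {
+   "cell_type": "markdown",
+   "id": "7e0c5a21-3d4f-4b88-a6e2-000000000004",
+   "metadata": {},
+   "source": [
+    "To double-check the upsert, you can ask Weaviate for the total object count in the collection and compare it with the number of summaries you attempted to insert. The `aggregate.over_all` call below follows the weaviate-client v4 API at the time of writing; treat it as a sketch and consult the client docs if the API has changed.\n",
+    "```python\n",
+    "# Count how many objects landed in the collection\n",
+    "count = collection.aggregate.over_all(total_count=True)\n",
+    "print(f\"Objects in the Products collection: {count.total_count}\")\n",
+    "print(f\"Summaries we attempted to insert: {len(review_summaries)}\")\n",
+    "```"
+   ]
+  },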
+  {
+   "cell_type": "markdown",
+   "id": "35079318-41a5-46fc-8475-5d728550fb88",
+   "metadata": {},
+   "source": [
+    "Let's now try to run a hybrid search on the following query below.\n",
+    "Hybrid search combines the results of a vector search and a keyword (BM25F) search by fusing the two result sets.\n",
+    "It will return the 3 closest entries in the database according to the search criteria."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f707954-c36b-4a83-874b-f817bd33c39a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hybrid search\n",
+    "response = collection.query.hybrid(\n",
+    "    query=\"easy to learn instrument\",\n",
+    "    limit=3\n",
+    ")\n",
+    "for o in response.objects:\n",
+    "    print(o.properties)"
+   ]
+  },
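+  {
+   "cell_type": "markdown",
+   "id": "d1f83b90-6c7a-4e15-b3aa-000000000005",
+   "metadata": {},
+   "source": [
+    "Hybrid search also exposes an `alpha` parameter that controls the blend between the two result sets: `alpha=0` is pure keyword (BM25) search, `alpha=1` is pure vector search, and values in between fuse the two. The sketch below reruns the same query at both extremes so you can compare the results; the parameter name follows the weaviate-client v4 API at the time of writing.\n",
+    "```python\n",
+    "# Compare keyword-only vs vector-only results for the same query\n",
+    "for alpha in (0.0, 1.0):\n",
+    "    print(f\"--- alpha = {alpha} ---\")\n",
+    "    response = collection.query.hybrid(\n",
+    "        query=\"easy to learn instrument\",\n",
+    "        alpha=alpha,\n",
+    "        limit=3\n",
+    "    )\n",
+    "    for o in response.objects:\n",
+    "        print(o.properties)\n",
+    "```"
+   ]
+  },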
+  {
+   "cell_type": "markdown",
+   "id": "04d39507-5e8e-4374-a33c-53e57db6ef99",
+   "metadata": {},
+   "source": [
+    "Let's now define a helper function that gives us the relevant context given a string query. Let's see what it returns based on the question: \"What is a good beginner harmonica\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1ca51c7-83e5-4896-acc9-753060592ba0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Helper function to run hybrid search on a user query and return the closest\n",
+    "# product review summaries relevant to the user query\n",
+    "def get_context(question, limit=3):\n",
+    "    response = collection.query.hybrid(\n",
+    "        query=question,\n",
+    "        limit=limit\n",
+    "    )\n",
+    "    return \"\\n\".join([str(o.properties) for o in response.objects])\n",
+    "\n",
+    "print(get_context(\"What is a good beginner harmonica\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "677f534c-8be4-4b6b-82d9-2df8e2ad12d4",
+   "metadata": {},
+   "source": [
+    "Great, we're now ready to build a sales assistant helper function.\n",
+    "\n",
+    "We first define a prompt template for Llama 3 - based on the context provided by the vector hybrid search (i.e. collection of product summaries of relevance to the question), provide a helpful recommendation to the customer. \n",
+    "\n",
+    "Also provide links to the product that the user can click on to view the product on Amazon's website. For that we use the fact that any product referenced by its aSIN can be accessed at the following url: `https://www.amazon.com/exec/obidos/ASIN/<insert aSIN here>`"
+   ]
+  },
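+  {
+   "cell_type": "markdown",
+   "id": "9a2b4c6d-8e10-4f22-b044-000000000006",
+   "metadata": {},
+   "source": [
+    "For reference, here is a tiny illustrative helper for the URL convention described above; `build_product_url` is just a hypothetical name and isn't used elsewhere in this notebook.\n",
+    "```python\n",
+    "def build_product_url(asin: str) -> str:\n",
+    "    # Any Amazon product can be reached via this ASIN-based URL pattern\n",
+    "    return f\"https://www.amazon.com/exec/obidos/ASIN/{asin}\"\n",
+    "\n",
+    "print(build_product_url(\"B001EL6I8W\"))\n",
+    "```"
+   ]
+  },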
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "856d021a-add5-48f4-a09c-258d2a617095",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sales_template = \"\"\"\n",
+    "You are a sales assistant. Answer the user questions as helpfully as possible.\n",
+    "Only recommend the products that are provided in the context provided below.\n",
+    "\n",
+    "Provide a reference to each product you mention with hyperlinks:\n",
+    "* Provide the name of the product\n",
+    "* Embed the hyperlink in the name of the product as follows\n",
+    "    * If the product name is \"Solid Electric Guitar Case with Accessories Compartment\"\n",
+    "    * And the aSIN is \"B001EL6I8W\"\n",
+    "    * Format the reference as follows: \n",
+    "         [Solid Electric Guitar Case with Accessories Compartment](https://www.amazon.com/exec/obidos/ASIN/B001EL6I8W)\n",
+    "\n",
+    "Finish with a references section.\n",
+    "\n",
+    "Customer question: {}\n",
+    "\n",
+    "Product context: {}\n",
+    "\n",
+    "AI:\n",
+    "\"\"\"\n",
+    "\n",
+    "def sales_assistant(question):  \n",
+    "    response = client.chat.completions.create(\n",
+    "                model=\"meta-llama-3-70b-instruct\",\n",
+    "                messages=[\n",
+    "                    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+    "                    {\"role\": \"user\", \"content\": sales_template.format(question, get_context(question, limit=10))},\n",
+    "                ],\n",
+    "                temperature=0,\n",
+    "                max_tokens=1024\n",
+    "            )\n",
+    "    \n",
+    "    return response.choices[0].message.content\n",
+    "\n",
+    "print(sales_assistant(\"what is must have accessory for my new electric guitar\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "faccba14-9216-4420-b6c5-ddf4029d7904",
+   "metadata": {},
+   "source": [
+    "# Part 3: Gradio-based sales assistant demo"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3e2b73b5-6bdf-4c87-b044-2690fd52605f",
+   "metadata": {},
+   "source": [
+    "In this section we build a simple an interactive sales bot assistant using Gradio."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53805acb-3e8d-40fa-8045-c589cb14eadd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gradio as gr\n",
+    "\n",
+    "def predict(message, history):\n",
+    "    history_openai_format = []\n",
+    "    for human, assistant in history:\n",
+    "        history_openai_format.append({\"role\": \"user\", \"content\": human})\n",
+    "        history_openai_format.append({\"role\": \"assistant\", \"content\": assistant})\n",
+    "    history_openai_format.append({\"role\": \"user\", \"content\": sales_template.format(message, get_context(message, limit=5))})\n",
+    "\n",
+    "    response = client.chat.completions.create(\n",
+    "        model = 'meta-llama-3-70b-instruct',\n",
+    "        messages = history_openai_format,\n",
+    "        temperature = 0.0,\n",
+    "        stream = True\n",
+    "     )\n",
+    "\n",
+    "    partial_message = \"\"\n",
+    "    for chunk in response:\n",
+    "        if chunk.choices[0].delta.content is not None:\n",
+    "              partial_message = partial_message + chunk.choices[0].delta.content\n",
+    "              yield partial_message\n",
+    "\n",
+    "gr.ChatInterface(predict).launch()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6d4e65fe-0246-40b7-adb6-9091cccbc486",
+   "metadata": {},
+   "source": [
+    "**Authors**\n",
+    "- Thierry Moreau, OctoAI - tmoreau@octo.ai\n",
+    "- Jonathan Tuite, Weaviate - jon@weaviate.io"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

The file diffs are limited because there are too many changed files
+ 1261 - 0
recipes/use_cases/llamaindex_cookbook.ipynb


+ 3 - 2
src/llama_recipes/inference/llm.py

@@ -187,6 +187,7 @@ class OctoAI(LLM):
     @override
     def valid_models(self) -> list[str]:
         return [
-            "llamaguard-7b",
-            "llama-2-13b-chat",
+            "llamaguard-2-8b",
+            "meta-llama-3-8b-instruct",
+            "meta-llama-3-70b-instruct",        
         ]

+ 1 - 1
src/llama_recipes/utils/config_utils.py

@@ -34,7 +34,7 @@ def update_config(config, **kwargs):
                     if hasattr(config, param_name):
                         setattr(config, param_name, v)
                     else:
-                        # In case of specialized config we can warm user
+                        # In case of specialized config we can warn user
                         print(f"Warning: {config_name} does not accept parameter: {k}")
             elif isinstance(config, train_config):
                 print(f"Warning: unknown parameter {k}")

tests/conftest.py → src/tests/conftest.py


+ 4 - 1
tests/datasets/test_custom_dataset.py

@@ -33,6 +33,7 @@ def check_padded_entry(batch, tokenizer):
     assert batch["input_ids"][0][-1] == tokenizer.eos_token_id
 
 
+@pytest.mark.skip(reason="Flakey due to random dataset order @todo fix order")
 @pytest.mark.skip_missing_tokenizer
 @patch('llama_recipes.finetuning.train')
 @patch('llama_recipes.finetuning.AutoTokenizer')
@@ -45,6 +46,7 @@ def test_custom_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker,
     setup_tokenizer(tokenizer)
 
     skip_special_tokens = llama_version == "meta-llama/Llama-2-7b-hf"
+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
 
     kwargs = {
         "dataset": "custom_dataset",
@@ -98,10 +100,11 @@ def test_custom_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker,
 @patch('llama_recipes.finetuning.AutoTokenizer.from_pretrained')
 @patch('llama_recipes.finetuning.optim.AdamW')
 @patch('llama_recipes.finetuning.StepLR')
-def test_unknown_dataset_error(step_lr, optimizer, tokenizer, get_model, train, mocker):
+def test_unknown_dataset_error(step_lr, optimizer, tokenizer, get_model, train, mocker, llama_version):
     from llama_recipes.finetuning import main
 
     tokenizer.return_value = mocker.MagicMock(side_effect=lambda x: {"input_ids":[len(x)*[0,]], "attention_mask": [len(x)*[0,]]})
+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
 
     kwargs = {
         "dataset": "custom_dataset",

+ 1 - 0
tests/datasets/test_grammar_datasets.py

@@ -26,6 +26,7 @@ def test_grammar_dataset(step_lr, optimizer, get_model, tokenizer, train, setup_
     from llama_recipes.finetuning import main
 
     setup_tokenizer(tokenizer)
+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
 
     BATCH_SIZE = 8
     kwargs = {

+ 1 - 0
tests/datasets/test_samsum_datasets.py

@@ -26,6 +26,7 @@ def test_samsum_dataset(step_lr, optimizer, get_model, tokenizer, train, mocker,
     from llama_recipes.finetuning import main
 
     setup_tokenizer(tokenizer)
+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
 
     BATCH_SIZE = 8
     kwargs = {

+ 3 - 1
tests/test_batching.py

@@ -25,7 +25,8 @@ def test_packing(step_lr, optimizer, get_model, tokenizer, train, setup_tokenize
     from llama_recipes.finetuning import main
 
     setup_tokenizer(tokenizer)
-
+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
+    
     kwargs = {
         "model_name": llama_version,
         "batch_size_training": 8,
@@ -72,6 +73,7 @@ def test_distributed_packing(dist, is_initialized, fsdp, setup, step_lr, optimiz
     from llama_recipes.finetuning import main
 
     setup_tokenizer(tokenizer)
+    get_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
 
     rank = 1
     os.environ['LOCAL_RANK'] = f'{rank}'

+ 2 - 1
tests/test_chat_completion.py

@@ -7,7 +7,7 @@ import pytest
 import torch
 from llama_recipes.inference.chat_utils import read_dialogs_from_file
 
-ROOT_DIR = Path(__file__).parents[1]
+ROOT_DIR = Path(__file__).parents[2]
 CHAT_COMPLETION_DIR = ROOT_DIR / "recipes/inference/local_inference/chat_completion/"
 
 sys.path = [CHAT_COMPLETION_DIR.as_posix()] + sys.path
@@ -107,6 +107,7 @@ def test_chat_completion(
     from chat_completion import main
 
     setup_tokenizer(tokenizer)
+    load_model.return_value.get_input_embeddings.return_value.weight.shape = [32000 if "Llama-2" in llama_version else 128256]
 
     kwargs = {
         "prompt_file": (CHAT_COMPLETION_DIR / "chats.json").as_posix(),

tests/test_finetuning.py → src/tests/test_finetuning.py


tests/test_finetuning_data_formatter.py → src/tests/test_finetuning_data_formatter.py


tests/test_sampler.py → src/tests/test_sampler.py


+ 1 - 0
tests/test_train_utils.py

@@ -103,6 +103,7 @@ def test_save_to_json(temp_output_dir, mocker):
     train_config.max_train_step = 0
     train_config.max_eval_step = 0
     train_config.output_dir = temp_output_dir
+    train_config.use_profiler = False
 
     results = train(
         model,