瀏覽代碼

Merge branch 'main' into raft

Kai Wu 9 月之前
父節點
當前提交
91fd013a9d
共有 75 個文件被更改,包括 13872 次插入364 次删除
  1. 25 0
      .github/scripts/spellcheck_conf/wordlist.txt
  2. 21 12
      README.md
  3. 1 3
      docs/LLM_finetuning.md
  4. 19 12
      docs/multi_gpu.md
  5. 5 4
      docs/single_gpu.md
  6. 1 1
      pyproject.toml
  7. 8 2
      recipes/3p_integrations/README.md
  8. 9 9
      recipes/3p_integrations/lamini/text2sql_memory_tuning/meta_lamini.ipynb
  9. 1 1
      recipes/3p_integrations/lamini/text2sql_memory_tuning/util/parse_arguments.py
  10. 9 9
      recipes/3p_integrations/llama_on_prem.md
  11. 448 0
      recipes/3p_integrations/llamaindex/dlai_agentic_rag/Building_Agentic_RAG_with_Llamaindex_L2_Tool_Calling.ipynb
  12. 4253 0
      recipes/3p_integrations/llamaindex/dlai_agentic_rag/Building_Agentic_RAG_with_Llamaindex_L3_Building_an_Agent_Reasoning_Loop.ipynb
  13. 4418 0
      recipes/3p_integrations/llamaindex/dlai_agentic_rag/Building_Agentic_RAG_with_Llamaindex_L4_Building_a_Multi-Document_Agent.ipynb
  14. 11 0
      recipes/3p_integrations/llamaindex/dlai_agentic_rag/README.md
  15. 0 0
      recipes/3p_integrations/octoai/MediaGen.ipynb
  16. 75 0
      recipes/3p_integrations/vllm/README.md
  17. 35 12
      recipes/3p_integrations/vllm/inference.py
  18. 4 1
      recipes/README.md
  19. 5 5
      recipes/quickstart/README.md
  20. 2 2
      recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_HF_transformers.ipynb
  21. 314 0
      recipes/quickstart/agents/dlai/AI_Agentic_Design_Patterns_with_AutoGen_L4_Tool_Use_and_Conversational_Chess.ipynb
  22. 784 0
      recipes/quickstart/agents/dlai/AI_Agents_in_LangGraph_L1_Build_an_Agent_from_Scratch.ipynb
  23. 355 0
      recipes/quickstart/agents/dlai/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb
  24. 581 0
      recipes/quickstart/agents/dlai/Functions_Tools_and_Agents_with_LangChain_L1_Function_Calling.ipynb
  25. 11 0
      recipes/quickstart/agents/dlai/README.md
  26. 4 6
      recipes/quickstart/finetuning/LLM_finetuning_overview.md
  27. 3 3
      recipes/quickstart/finetuning/README.md
  28. 3 3
      recipes/quickstart/finetuning/datasets/README.md
  29. 38 5
      recipes/quickstart/finetuning/multigpu_finetuning.md
  30. 1 1
      recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb
  31. 8 5
      recipes/quickstart/finetuning/singlegpu_finetuning.md
  32. 4 3
      recipes/quickstart/inference/README.md
  33. 10 7
      recipes/quickstart/inference/local_inference/README.md
  34. 13 11
      recipes/quickstart/inference/local_inference/chat_completion/chat_completion.py
  35. 144 128
      recipes/quickstart/inference/local_inference/inference.py
  36. 51 0
      recipes/quickstart/inference/modelUpgradeExample.py
  37. 10 7
      recipes/responsible_ai/README.md
  38. 8 8
      recipes/responsible_ai/llama_guard/README.md
  39. 2 2
      recipes/responsible_ai/llama_guard/inference.py
  40. 793 0
      recipes/responsible_ai/llama_guard/llama_guard_customization_via_prompting_and_fine_tuning.ipynb
  41. 11 0
      recipes/responsible_ai/prompt_guard/README.md
  42. 0 0
      recipes/responsible_ai/prompt_guard/__init__.py
  43. 180 0
      recipes/responsible_ai/prompt_guard/inference.py
  44. 817 0
      recipes/responsible_ai/prompt_guard/prompt_guard_tutorial.ipynb
  45. 3 6
      recipes/use_cases/README.md
  46. 1 1
      recipes/use_cases/customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb
  47. 3 3
      recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb
  48. 0 0
      recipes/use_cases/customerservice_chatbots/ai_agent_chatbot/SalesBot.ipynb
  49. 0 0
      recipes/use_cases/customerservice_chatbots/ai_agent_chatbot/musical_instruments_reviews.csv
  50. 0 0
      recipes/use_cases/customerservice_chatbots/messenger_chatbot/llama_messenger.py
  51. 2 2
      recipes/use_cases/customerservice_chatbots/messenger_llama/messenger_llama3.md
  52. 0 0
      recipes/use_cases/customerservice_chatbots/whatsapp_chatbot/llama_chatbot.py
  53. 1 1
      recipes/use_cases/customerservice_chatbots/whatsapp_llama/whatsapp_llama3.md
  54. 2 1
      requirements.txt
  55. 1 0
      src/llama_recipes/configs/__init__.py
  56. 7 2
      src/llama_recipes/configs/datasets.py
  57. 30 0
      src/llama_recipes/configs/quantization.py
  58. 1 1
      src/llama_recipes/configs/training.py
  59. 2 1
      src/llama_recipes/datasets/__init__.py
  60. 131 0
      src/llama_recipes/datasets/toxicchat_dataset.py
  61. 33 42
      src/llama_recipes/finetuning.py
  62. 22 5
      src/llama_recipes/inference/model_utils.py
  63. 95 5
      src/llama_recipes/inference/prompt_format_utils.py
  64. 1 1
      src/llama_recipes/inference/safety_utils.py
  65. 4 4
      src/llama_recipes/utils/hf_llama_conversion/README.md
  66. 7 5
      src/llama_recipes/utils/hf_llama_conversion/compare_llama_weights.py
  67. 14 8
      src/llama_recipes/tools/convert_hf_weights_to_llama.py
  68. 3 0
      src/llama_recipes/utils/dataset_utils.py
  69. 1 1
      src/tests/conftest.py
  70. 1 1
      src/tests/datasets/test_custom_dataset.py
  71. 1 1
      src/tests/datasets/test_grammar_datasets.py
  72. 1 1
      src/tests/datasets/test_samsum_datasets.py
  73. 1 1
      src/tests/test_batching.py
  74. 2 2
      tools/benchmarks/inference/on_prem/README.md
  75. 7 7
      tools/benchmarks/llm_eval_harness/README.md

+ 25 - 0
.github/scripts/spellcheck_conf/wordlist.txt

@@ -1351,6 +1351,12 @@ Weaviate
 MediaGen
 SDXL
 SVD
+QLORA
+Agentic
+AutoGen
+DeepLearning
+Deeplearning
+Llamaindex
 KV
 KVs
 XSUM
@@ -1407,3 +1413,22 @@ numRefusal
 totalQA
 DirectoryLoader
 SitemapLoader
+nf
+quant
+DLAI
+agentic
+containts
+dlai
+Prerequirements
+tp
+QLoRA
+ntasks
+srun
+xH
+unquantized
+eom
+ipython
+CPUs
+modelUpgradeExample
+guardrailing
+

文件差異過大導致無法顯示
+ 21 - 12
README.md


+ 1 - 3
docs/LLM_finetuning.md

@@ -1,6 +1,6 @@
 ## LLM Fine-Tuning
 
-Here we discuss fine-tuning Meta Llama 3 with a couple of different recipes. We will cover two scenarios here:
+Here we discuss fine-tuning Meta Llama with a couple of different recipes. We will cover two scenarios here:
 
 
 ## 1. **Parameter Efficient Model Fine-Tuning**
@@ -18,8 +18,6 @@ These methods will address three aspects:
 
 HF [PEFT](https://github.com/huggingface/peft) library provides an easy way of using these methods which we make use of here. Please read more [here](https://huggingface.co/blog/peft).
 
-
-
 ## 2. **Full/ Partial Parameter Fine-Tuning**
 
 Full parameter fine-tuning has its own advantages, in this method there are multiple strategies that can help:

+ 19 - 12
docs/multi_gpu.md

@@ -6,13 +6,12 @@ To run fine-tuning on multi-GPUs, we will  make use of two packages:
 
 2. [FSDP](https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html) which helps us parallelize the training over multiple GPUs. [More details](LLM_finetuning.md/#2-full-partial-parameter-finetuning).
 
-Given the combination of PEFT and FSDP, we would be able to fine tune a Meta Llama 3 8B model on multiple GPUs in one node or multi-node.
+Given the combination of PEFT and FSDP, we would be able to fine tune a Meta Llama 8B model on multiple GPUs in one node.
+For big models like 405B we will need to fine-tune in a multi-node setup even if 4bit quantization is enabled.
 
 ## Requirements
 To run the examples, make sure to install the llama-recipes package and clone the github repository in order to use the provided [`finetuning.py`](../recipes/quickstart/finetuning/finetuning.py) script with torchrun (See [README.md](../README.md) for details).
 
-**Please note that the llama_recipes package will install PyTorch 2.0.1 version, in case you want to run FSDP + PEFT, please make sure to install PyTorch nightlies.**
-
 ## How to run it
 
 Get access to a machine with multiple GPUs ( in this case we tested with 4 A100 and A10s).
@@ -24,7 +23,7 @@ This runs with the `samsum_dataset` for summarization application by default.
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -43,7 +42,7 @@ We use `torchrun` here to spawn multiple processes for FSDP.
 Setting `use_fast_kernels` will enable using of Flash Attention or Xformer memory-efficient kernels based on the hardware being used. This would speed up the fine-tuning job. This has been enabled in `optimum` library from HuggingFace as a one-liner API, please read more [here](https://pytorch.org/blog/out-of-the-box-acceleration/).
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model --use_fast_kernels
 ```
 
 ### Fine-tuning using FSDP Only
@@ -52,8 +51,16 @@ If interested in running full parameter finetuning without making use of PEFT me
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 8  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 8  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --use_fast_kernels
+
+```
+
+### Fine-tuning using FSDP + QLORA
+
+This has been tested on 4 H100s GPUs.
 
+```bash
+ FSDP_CPU_RAM_EFFICIENT_LOADING=1 ACCELERATE_USE_FSDP=1 torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --quantization 4bit --model_name /path_of_model_folder/70B  --mixed_precision False --low_cpu_fsdp --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
 ```
 
 ### Fine-tuning using FSDP on 70B Model
@@ -62,7 +69,7 @@ If you are interested in running full parameter fine-tuning on the 70B model, yo
 
 ```bash
 
-torchrun --nnodes 1 --nproc_per_node 8 examples/finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
+torchrun --nnodes 1 --nproc_per_node 8 recipes/quickstart/finetuning/finetuning.py --enable_fsdp --low_cpu_fsdp --fsdp_config.pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
 
 ```
 
@@ -72,7 +79,7 @@ Here we use a slurm script to schedule a job with slurm over multiple nodes.
 
 ```bash
 
-sbatch examples/multi_node.slurm
+sbatch recipes/quickstart/finetuning/multi_node.slurm
 # Change the num nodes and GPU per nodes in the script before running.
 
 ```
@@ -95,16 +102,16 @@ To run with each of the datasets set the `dataset` flag in the command as shown
 
 ```bash
 # grammer_dataset
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  examples/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
 
 ```
 
@@ -182,7 +189,7 @@ It lets us specify the training settings for everything from `model_name` to `da
 
 * `fsdp_activation_checkpointing` enables activation checkpoining for FSDP, this saves significant amount of memory with the trade off of recomputing itermediate activations during the backward pass. The saved memory can be re-invested in higher batch sizes to increase the throughput. We recommond you use this option.
 
-* `pure_bf16` it moves the  model to `BFloat16` and if `optimizer` is set to `anyprecision` then optimizer states will be kept in `BFloat16` as well. You can use this option if necessary.
+* `fsdp_config.pure_bf16` it moves the  model to `BFloat16` and if `optimizer` is set to `anyprecision` then optimizer states will be kept in `BFloat16` as well. You can use this option if necessary.
 
 ## FLOPS Counting and Pytorch Profiling
 

+ 5 - 4
docs/single_gpu.md

@@ -17,10 +17,11 @@ To run the examples, make sure to install the llama-recipes package (See [README
 
 Get access to a machine with one GPU or if using a multi-GPU machine please make sure to only make one of them visible using `export CUDA_VISIBLE_DEVICES=GPU:id` and run the following. It runs by default with `samsum_dataset` for summarization application.
 
+**NOTE** To run the fine-tuning with `QLORA`, make sure to set `--peft_method lora` and `--quantization int4`.
 
 ```bash
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization --use_fp16 --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization 8bit --use_fp16 --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
 The args used in the command above are:
@@ -51,16 +52,16 @@ to run with each of the datasets set the `dataset` flag in the command as shown
 ```bash
 # grammer_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization 8bit --dataset grammar_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization 8bit --dataset alpaca_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m llama_recipes.finetuning  --use_peft --peft_method lora --quantization 8bit --dataset samsum_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
 

+ 1 - 1
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "llama-recipes"
-version = "0.0.2"
+version = "0.0.3"
 authors = [
   { name="Hamid Shojanazeri", email="hamidnazeri@meta.com" },
   { name="Matthias Reso", email="mreso@meta.com" },

+ 8 - 2
recipes/3p_integrations/README.md

@@ -1,2 +1,8 @@
-## [Running Llama 3 On-Prem with vLLM and TGI](llama_on_prem.md)
-This tutorial shows how to use Llama 3 with [vLLM](https://github.com/vllm-project/vllm) and Hugging Face [TGI](https://github.com/huggingface/text-generation-inference) to build Llama 3 on-prem apps.
+## Llama-Recipes 3P Integrations
+
+This folder contains example scripts showcasing the use of Meta Llama with popular platforms and tooling in the LLM ecosystem. 
+
+Each folder is maintained by the platform-owner. 
+
+> [!NOTE]
+> If you'd like to add your platform here, please open a new issue with details of your examples.

+ 9 - 9
recipes/3p_integrations/lamini/text2sql_memory_tuning/meta_lamini.ipynb

@@ -145,7 +145,7 @@
     "class Args:\n",
     "    def __init__(self, \n",
     "                 max_examples=100, \n",
-    "                 sql_model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\", \n",
+    "                 sql_model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\", \n",
     "                 gold_file_name=\"gold-test-set.jsonl\",\n",
     "                 training_file_name=\"generated_queries.jsonl\",\n",
     "                 num_to_generate=10):\n",
@@ -197,7 +197,7 @@
     }
    ],
    "source": [
-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
     "\n",
     "question = \"\"\"Who is the highest paid NBA player?\"\"\"\n",
     "system = f\"\"\"You are an NBA analyst with 15 years of experience writing complex SQL queries. Consider the nba_roster table with the following schema:\n",
@@ -418,7 +418,7 @@
     "class ScoreStage(GenerationNode):\n",
     "    def __init__(self):\n",
     "        super().__init__(\n",
-    "            model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
+    "            model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
     "            max_new_tokens=150,\n",
     "        )\n",
     "\n",
@@ -712,7 +712,7 @@
     "class ModelStage(GenerationNode):\n",
     "    def __init__(self):\n",
     "        super().__init__(\n",
-    "            model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
+    "            model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
     "            max_new_tokens=300,\n",
     "        )\n",
     "\n",
@@ -808,7 +808,7 @@
     "class QuestionStage(GenerationNode):\n",
     "    def __init__(self):\n",
     "        super().__init__(\n",
-    "            model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\",\n",
+    "            model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
     "            max_new_tokens=150,\n",
     "        )\n",
     "\n",
@@ -1055,7 +1055,7 @@
    ],
    "source": [
     "args = Args()\n",
-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
     "\n",
     "dataset = get_dataset(args, make_question)\n",
     "finetune_args = get_default_finetune_args()\n",
@@ -1601,7 +1601,7 @@
    ],
    "source": [
     "args = Args(training_file_name=\"archive/generated_queries_large_filtered_cleaned.jsonl\")\n",
-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
     "\n",
     "dataset = get_dataset(args, make_question)\n",
     "finetune_args = get_default_finetune_args()\n",
@@ -1798,7 +1798,7 @@
    ],
    "source": [
     "args = Args(training_file_name=\"generated_queries_v2.jsonl\")\n",
-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
     "\n",
     "dataset = get_dataset(args, make_question)\n",
     "finetune_args = get_default_finetune_args()\n",
@@ -1966,7 +1966,7 @@
    ],
    "source": [
     "args = Args(training_file_name=\"archive/generated_queries_v2_large_filtered_cleaned.jsonl\")\n",
-    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+    "llm = lamini.Lamini(model_name=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
     "\n",
     "dataset = get_dataset(args, make_question)\n",
     "finetune_args = get_default_finetune_args()\n",

+ 1 - 1
recipes/3p_integrations/lamini/text2sql_memory_tuning/util/parse_arguments.py

@@ -16,7 +16,7 @@ def parse_arguments():
     parser.add_argument(
         "--sql-model-name",
         type=str,
-        default="meta-llama/Meta-Llama-3-8B-Instruct",
+        default="meta-llama/Meta-Llama-3.1-8B-Instruct",
         help="The model to use for text2sql",
         required=False,
     )

+ 9 - 9
recipes/3p_integrations/llama_on_prem.md

@@ -8,7 +8,7 @@ We'll use the Amazon EC2 instance running Ubuntu with an A10G 24GB GPU as an exa
 
 The Colab notebook to connect via LangChain with Llama 3 hosted as the vLLM and TGI API services is [here](https://colab.research.google.com/drive/1rYWLdgTGIU1yCHmRpAOB2D-84fPzmOJg), also shown in the sections below.
 
-This tutorial assumes that you you have been granted access to the Meta Llama 3 on Hugging Face - you can open a Hugging Face Meta model page [here](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) to confirm that you see "Gated model You have been granted access to this model"; if you see "You need to agree to share your contact information to access this model", simply complete and submit the form in the page.
+This tutorial assumes that you you have been granted access to the Meta Llama 3 on Hugging Face - you can open a Hugging Face Meta model page [here](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) to confirm that you see "Gated model You have been granted access to this model"; if you see "You need to agree to share your contact information to access this model", simply complete and submit the form in the page.
 
 You'll also need your Hugging Face access token which you can get at your Settings page [here](https://huggingface.co/settings/tokens).
 
@@ -33,7 +33,7 @@ There are two ways to deploy Llama 3 via vLLM, as a general API server or an Ope
 Run the command below to deploy vLLM as a general Llama 3 service:
 
 ```
-python -m vllm.entrypoints.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3.1-8B-Instruct
 ```
 
 Then on another terminal you can run:
@@ -68,13 +68,13 @@ Also, if you have multiple GPUs, you can add the `--tensor-parallel-size` argume
 git clone https://github.com/vllm-project/vllm
 cd vllm/vllm/entrypoints
 conda activate llama3
-python api_server.py --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct --tensor-parallel-size 4
+python api_server.py --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3.1-8B-Instruct --tensor-parallel-size 4
 ```
 
 With multiple GPUs, you can also run replica of models as long as your model size can fit into targeted GPU memory. For example, if you have two A10G with 24 GB memory, you can run two Llama 3 8B models at the same time. This can be done by launching two api servers each targeting specific CUDA cores on different ports:
-`CUDA_VISIBLE_DEVICES=0 python api_server.py --host 0.0.0.0 --port 5000  --model meta-llama/Meta-Llama-3-8B-Instruct`
+`CUDA_VISIBLE_DEVICES=0 python api_server.py --host 0.0.0.0 --port 5000  --model meta-llama/Meta-Llama-3.1-8B-Instruct`
 and
-`CUDA_VISIBLE_DEVICES=1 python api_server.py --host 0.0.0.0 --port 5001  --model meta-llama/Meta-Llama-3-8B-Instruct`
+`CUDA_VISIBLE_DEVICES=1 python api_server.py --host 0.0.0.0 --port 5001  --model meta-llama/Meta-Llama-3.1-8B-Instruct`
 The benefit would be that you can balance incoming requests to both models, reaching higher batch size processing for a trade-off of generation latency.
 
 
@@ -83,14 +83,14 @@ The benefit would be that you can balance incoming requests to both models, reac
 You can also deploy the vLLM hosted Llama 3 as an OpenAI-Compatible service to easily replace code using OpenAI API. First, run the command below:
 
 ```
-python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 5000 --model meta-llama/Meta-Llama-3.1-8B-Instruct
 ```
 
 Then on another terminal, run:
 
 ```
 curl http://localhost:5000/v1/completions -H "Content-Type: application/json" -d '{
-        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
         "prompt": "Who wrote the book Innovators dilemma?",
         "max_tokens": 300,
         "temperature": 0
@@ -118,7 +118,7 @@ from langchain.llms import VLLMOpenAI
 llm = VLLMOpenAI(
     openai_api_key="EMPTY",
     openai_api_base="http://<vllm_server_ip_address>:5000/v1",
-    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
+    model_name="meta-llama/Meta-Llama-3.1-8B-Instruct",
 )
 
 print(llm("Who wrote the book godfather?"))
@@ -136,7 +136,7 @@ You can now use the Llama 3 instance `llm` created this way in any of the demo a
 The easiest way to deploy Llama 3 with TGI is using its official docker image. First, replace `<your_hugging_face_access_token>` and set the three required shell variables (you may replace the `model` value above with another Llama 3 model):
 
 ```
-model=meta-llama/Meta-Llama-3-8B-Instruct
+model=meta-llama/Meta-Llama-3.1-8B-Instruct
 volume=$PWD/data
 token=<your_hugging_face_access_token>
 ```

+ 448 - 0
recipes/3p_integrations/llamaindex/dlai_agentic_rag/Building_Agentic_RAG_with_Llamaindex_L2_Tool_Calling.ipynb

@@ -0,0 +1,448 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/3p_integrations/llamaindex/dlai_agentic_rag/Building_Agentic_RAG_with_Llamaindex_L2_Tool_Calling.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>\n",
+    "\n",
+    "This notebook ports the DeepLearning.AI short course [Building Agentic RAG with Llamaindex Lesson 2 Tool Calling](https://learn.deeplearning.ai/courses/building-agentic-rag-with-llamaindex/lesson/3/tool-calling) to using Llama 3. It shows how to use Llama 3 to not only pick a function to execute, but also infer an argument to pass through the function.\n",
+    "\n",
+    "You should take the course before or after going through this notebook to have a deeper understanding.\n",
+    "\n",
+    "Note: Unlike Lesson 1 where we use Llama 3 70b on [Groq](https://groq.com/), this lesson uses Llama 3 on [Fireworks.ai](https://fireworks.ai/) to overcome the rate limit issue with Groq on some summary tool calling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "eiJsOa29ej7G",
+    "outputId": "edc5d39c-f379-4410-db9f-998db9c099be"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install llama-index\n",
+    "!pip install llama-index-embeddings-huggingface\n",
+    "!pip install llama-index-llms-fireworks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "NZ9l6k_3WncE"
+   },
+   "outputs": [],
+   "source": [
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "QkaALpnIQ01b"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core.tools import FunctionTool\n",
+    "\n",
+    "def add(x: int, y: int) -> int:\n",
+    "    \"\"\"Adds two integers together.\"\"\"\n",
+    "    return x + y\n",
+    "\n",
+    "def mystery(x: int, y: int) -> int:\n",
+    "    \"\"\"Mystery function that operates on top of two numbers.\"\"\"\n",
+    "    return (x + y) * (x + y)\n",
+    "\n",
+    "\n",
+    "add_tool = FunctionTool.from_defaults(fn=add)\n",
+    "mystery_tool = FunctionTool.from_defaults(fn=mystery)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os \n",
+    "\n",
+    "os.environ['FIREWORKS_API_KEY'] = 'xxx' # get a free key at https://fireworks.ai/api-keys"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "mA3AG6CFQ3fj",
+    "outputId": "b872d91f-3a16-4d40-cacb-59c8ba5b5bde"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.llms.fireworks import Fireworks\n",
+    "\n",
+    "# Llama 3 8b on Fireworks.ai also works in some cases, but 70b works better overall\n",
+    "#llm = Fireworks(model=\"accounts/fireworks/models/llama-v3-8b-instruct\", temperature=0)\n",
+    "llm = Fireworks(model=\"accounts/fireworks/models/llama-v3-70b-instruct\", temperature=0)\n",
+    "\n",
+    "# a quick sanity test\n",
+    "#llm.complete(\"Who wrote the  book godfather? \").text\n",
+    "\n",
+    "response = llm.predict_and_call(\n",
+    "    [add_tool, mystery_tool],\n",
+    "    \"Tell me the output of the mystery function on 2 and 9\",\n",
+    "    verbose=True\n",
+    ")\n",
+    "print(str(response))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "8TTkd6vuUMmh",
+    "outputId": "c2c419af-e9d1-48bb-aa51-c785dcdee3a0"
+   },
+   "outputs": [],
+   "source": [
+    "!wget \"https://openreview.net/pdf?id=VtmBAGCN7o\" -O metagpt.pdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "auZQalH5J7CU"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core import SimpleDirectoryReader\n",
+    "\n",
+    "# https://arxiv.org/pdf/2308.00352 metagpt.pdf\n",
+    "documents = SimpleDirectoryReader(input_files=[\"metagpt.pdf\"]).load_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "GFfUjJypJ7Eq"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core.node_parser import SentenceSplitter\n",
+    "splitter = SentenceSplitter(chunk_size=1024)\n",
+    "nodes = splitter.get_nodes_from_documents(documents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "FrqorjH3VHmT",
+    "outputId": "b4888caf-0623-4d64-dba1-4d74c12e64f3"
+   },
+   "outputs": [],
+   "source": [
+    "print(nodes[0].get_content(metadata_mode=\"all\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "9WisqWK4VPCZ"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core import Settings, VectorStoreIndex\n",
+    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
+    "\n",
+    "Settings.llm = llm\n",
+    "\n",
+    "Settings.embed_model = HuggingFaceEmbedding(\n",
+    "    model_name=\"BAAI/bge-small-en-v1.5\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "YS4e0mzsVKsl"
+   },
+   "outputs": [],
+   "source": [
+    "# Settings.llm and embed_model apply to which call below? VectorStoreIndex(), as_query_engine?\n",
+    "\n",
+    "from llama_index.core import VectorStoreIndex\n",
+    "\n",
+    "vector_index = VectorStoreIndex(nodes)\n",
+    "query_engine = vector_index.as_query_engine(similarity_top_k=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "S7tz2Z28VKv1"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core.vector_stores import MetadataFilters\n",
+    "\n",
+    "query_engine = vector_index.as_query_engine(\n",
+    "    similarity_top_k=2,\n",
+    "    filters=MetadataFilters.from_dicts(\n",
+    "        [\n",
+    "            {\"key\": \"page_label\", \"value\": \"2\"}\n",
+    "        ]\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "response = query_engine.query(\n",
+    "    \"What are some high-level results of MetaGPT?\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "CttWxW8aVKyk",
+    "outputId": "4b64a64f-a989-4ee0-f08e-a6d6f5db42b6"
+   },
+   "outputs": [],
+   "source": [
+    "print(str(response))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "ZvQGoUR0VK1I",
+    "outputId": "9033f46f-baba-4345-bd6c-29ce4db3ea39"
+   },
+   "outputs": [],
+   "source": [
+    "for n in response.source_nodes:\n",
+    "    print(n.metadata)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "5r1MHbLOPT8Y"
+   },
+   "outputs": [],
+   "source": [
+    "from typing import List\n",
+    "from llama_index.core.vector_stores import FilterCondition\n",
+    "\n",
+    "\n",
+    "def vector_query(\n",
+    "    query: str,\n",
+    "    page_numbers: List[str]\n",
+    ") -> str:\n",
+    "    \"\"\"Perform a vector search over an index.\n",
+    "\n",
+    "    query (str): the string query to be embedded.\n",
+    "    page_numbers (List[str]): Filter by set of pages. Leave BLANK if we want to perform a vector search\n",
+    "        over all pages. Otherwise, filter by the set of specified pages.\n",
+    "\n",
+    "    \"\"\"\n",
+    "\n",
+    "    metadata_dicts = [\n",
+    "        {\"key\": \"page_label\", \"value\": p} for p in page_numbers\n",
+    "    ]\n",
+    "\n",
+    "    query_engine = vector_index.as_query_engine(\n",
+    "        similarity_top_k=2,\n",
+    "        filters=MetadataFilters.from_dicts(\n",
+    "            metadata_dicts,\n",
+    "            condition=FilterCondition.OR\n",
+    "        )\n",
+    "    )\n",
+    "    response = query_engine.query(query)\n",
+    "    return response\n",
+    "\n",
+    "\n",
+    "vector_query_tool = FunctionTool.from_defaults(\n",
+    "    name=\"vector_tool\",\n",
+    "    fn=vector_query\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "2jMB3iS6VjFg",
+    "outputId": "4ecd2d26-e159-4765-9aa5-818c5308ae02"
+   },
+   "outputs": [],
+   "source": [
+    "response = llm.predict_and_call(\n",
+    "    [vector_query_tool],\n",
+    "    \"What are the high-level results of MetaGPT as described on page 2?\",\n",
+    "    verbose=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "so2p09VNVm9I",
+    "outputId": "8fd7027b-e356-492c-decf-36340ad90978"
+   },
+   "outputs": [],
+   "source": [
+    "for n in response.source_nodes:\n",
+    "    print(n.metadata)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "AuxhlFxHV4MV"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core import SummaryIndex\n",
+    "from llama_index.core.tools import QueryEngineTool\n",
+    "\n",
+    "summary_index = SummaryIndex(nodes)\n",
+    "summary_query_engine = summary_index.as_query_engine(\n",
+    "    response_mode=\"tree_summarize\",\n",
+    "    use_async=True,\n",
+    ")\n",
+    "summary_tool = QueryEngineTool.from_defaults(\n",
+    "    name=\"summary_tool\",\n",
+    "    query_engine=summary_query_engine,\n",
+    "    description=(\n",
+    "        \"Useful if you want to get a summary of MetaGPT\"\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "dSqh6iLkV61F",
+    "outputId": "9dc5e545-9f9d-4a7d-8332-beb158574b8e"
+   },
+   "outputs": [],
+   "source": [
+    "response = llm.predict_and_call(\n",
+    "    [vector_query_tool, summary_tool],\n",
+    "    \"What are the MetaGPT comparisons with ChatDev described on page 8?\",\n",
+    "    verbose=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "OsPn62A2V8R7",
+    "outputId": "cf74caa0-5f59-4f3c-f806-f996b317df8c"
+   },
+   "outputs": [],
+   "source": [
+    "for n in response.source_nodes:\n",
+    "    print(n.metadata)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "ugdDXz1EV96J",
+    "outputId": "fdc93775-04a0-4632-f190-0ef8d3800651"
+   },
+   "outputs": [],
+   "source": [
+    "response = llm.predict_and_call(\n",
+    "    [vector_query_tool, summary_tool],\n",
+    "    \"What is a summary of the paper?\",\n",
+    "    verbose=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "fx0RBeX0OS3n"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

文件差異過大導致無法顯示
+ 4253 - 0
recipes/3p_integrations/llamaindex/dlai_agentic_rag/Building_Agentic_RAG_with_Llamaindex_L3_Building_an_Agent_Reasoning_Loop.ipynb


文件差異過大導致無法顯示
+ 4418 - 0
recipes/3p_integrations/llamaindex/dlai_agentic_rag/Building_Agentic_RAG_with_Llamaindex_L4_Building_a_Multi-Document_Agent.ipynb


+ 11 - 0
recipes/3p_integrations/llamaindex/dlai_agentic_rag/README.md

@@ -0,0 +1,11 @@
+# Building Agentic RAG with Llamaindex
+
+The folder here containts the Llama 3 ported notebooks of the DLAI short course [Building Agentic RAG with Llamaindex](https://www.deeplearning.ai/short-courses/building-agentic-rag-with-llamaindex/).
+
+1. [Building Agentic RAG with Llamaindex L1 Router Engine](../../../quickstart/agents/dlai/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb) shows how to implement a simple agentic RAG, a router that will pick up one of several query tools (question answering or summarization) to execute a query on a single document. Note this notebook is located in the `quickstart` folder.
+
+2. [Building Agentic RAG with Llamaindex L2 Tool Calling](Building_Agentic_RAG_with_Llamaindex_L2_Tool_Calling.ipynb) shows how to use Llama 3 to not only pick a function to execute, but also infer an argument to pass through the function.
+
+3. [Building Agentic RAG with Llamaindex L3 Building an Agent Reasoning Loop](Building_Agentic_RAG_with_Llamaindex_L3_Building_an_Agent_Reasoning_Loop.ipynb) shows how to define a complete agent reasoning loop to reason over tools and multiple steps on a complex question the user asks about a single document while maintaining memory.
+
+3. [Building Agentic RAG with Llamaindex L4 Building a Multi-Document Agent](Building_Agentic_RAG_with_Llamaindex_L4_Building_a_Multi-Document_Agent.ipynb) shows how to use an agent to handle multiple documents and increasing degrees of complexity.

recipes/use_cases/MediaGen.ipynb → recipes/3p_integrations/octoai/MediaGen.ipynb


+ 75 - 0
recipes/3p_integrations/vllm/README.md

@@ -0,0 +1,75 @@
+# Llama inference with vLLM
+
+This folder contains an example for running Llama inference on multiple-gpus in single- as well as multi-node scenarios using vLLM.
+
+## Prerequirements
+
+To run this example we will need to install vLLM as well as ray in case multi-node inference is the goal.
+
+```bash
+pip install vllm
+
+# For multi-node inference we also need to install ray
+pip install ray[default]
+```
+
+For the following examples we will assume that we fine-tuned a base model using the LoRA method and we have setup the following environment variables pointing to the base model as well as LoRA adapter:
+
+```bash
+export MODEL_PATH=/path/to/out/base/model
+export PEFT_MODEL_PATH=/path/to/out/peft/model
+```
+
+## Single-node multi-gpu inference
+To launch the inference simply execute the following command changing the tp_size parameter to the numbers of GPUs you have available:
+
+``` bash
+python inference.py --model_name $MODEL_PATH --peft_model_name $PEFT_MODEL_PATH --tp_size 8 --user_prompt "Hello my name is"
+```
+The script will ask for another prompt ina loop after completing the generation which you can exit by simply pressing enter and leaving the prompt empty.
+When using multiple gpus the model will automatically be split accross the available GPUs using tensor parallelism.
+
+## Multi-node multi-gpu inference
+The FP8 quantized variants of Meta Llama (i.e. meta-llama/Meta-Llama-3.1-405B-FP8 and meta-llama/Meta-Llama-3.1-405B-Instruct-FP8) can be executed on a single node with 8x80GB H100 using the script located in this folder.
+To run the unquantized Meta Llama 405B variants (i.e. meta-llama/Meta-Llama-3.1-405B and meta-llama/Meta-Llama-3.1-405B-Instruct) we need multi-node inference.
+vLLM allows this by leveraging pipeline parallelism accros nodes while still applying tensor parallelism insid each node.
+To start a multi-node inference we first need to set up a ray serves which well be leveraged by vLLM to execute the model across node boundaries.
+
+```bash
+# On the head node we start the clustr as follows
+ray start --head
+
+# After the server starts it prints out a couple of lines including the command to add nodes to the cluster e.g.:
+# To add another node to this Ray cluster, run
+#   ray start --address='<head-node-ip-address>:6379'
+# Where the head node ip address will depend on your environment
+
+# We can then add the worker nodes by executing the command in a shell on the worker node
+ray start --address='<head-node-ip-address>:6379'
+
+# We can check if the cluster was launched successfully by executing this on any node
+ray status
+
+# It should show the number of nodes we have added as well as the head node
+# Node status
+# ---------------------------------------------------------------
+# Active:
+#  1 node_82143b740a25228c24dc8bb3a280b328910b2fcb1987eee52efb838b
+#  1 node_3f2c673530de5de86f953771538f35437ab60e3cacd7730dbca41719
+```
+
+To launch the inference we can then execute the inference script while we adapt pp_size and tp_size to our environment.
+
+```
+pp_size - number of worker + head nodes
+
+tp_size - number of GPUs per node
+```
+
+If our environment consists of two nodes with 8 GPUs each we would execute:
+```bash
+python inference.py --model_name $MODEL_PATH --peft_model_name $PEFT_MODEL_PATH --pp_size 2 --tp_size 8 --user_prompt "Hello my name is"
+```
+
+The launch of the vLLM engine will take some time depending on your environment as each worker will need to load the checkpoint files to extract its fraction of the weights.
+and even if it seem to hang

+ 35 - 12
recipes/3p_integrations/vllm/inference.py

@@ -1,11 +1,13 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
+import uuid
+import asyncio
 import fire
 
 import torch
-from vllm import LLM
-from vllm import LLM, SamplingParams
+from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
+from vllm.lora.request import LoRARequest
 from accelerate.utils import is_xpu_available
 
 if is_xpu_available():
@@ -15,13 +17,24 @@ else:
 
 torch.manual_seed(42)
 
-def load_model(model_name, tp_size=1):
+def load_model(model_name, peft_model=None, pp_size=1, tp_size=1):
+    additional_configs = {}
+    if peft_model:
+        additional_configs["enable_lora"] = True
+        
+    engine_config = AsyncEngineArgs(
+        model=model_name,
+        pipeline_parallel_size=pp_size,
+        tensor_parallel_size=tp_size,
+        max_loras=1,
+        **additional_configs)
 
-    llm = LLM(model_name, tensor_parallel_size=tp_size)
+    llm = AsyncLLMEngine.from_engine_args(engine_config)
     return llm
 
-def main(
+async def main(
     model,
+    peft_model_name=None,
     max_new_tokens=100,
     user_prompt=None,
     top_p=0.9,
@@ -35,26 +48,36 @@ def main(
 
         print(f"sampling params: top_p {top_p} and temperature {temperature} for this inference request")
         sampling_param = SamplingParams(top_p=top_p, temperature=temperature, max_tokens=max_new_tokens)
-        
 
-        outputs = model.generate(user_prompt, sampling_params=sampling_param)
+        lora_request = None
+        if peft_model_name:
+            lora_request = LoRARequest("lora",0,peft_model_name)
+
+        req_id = str(uuid.uuid4())
+
+        generator = model.generate(user_prompt, sampling_param, req_id, lora_request=lora_request)
+        output = None
+        async for request_output in generator:
+            output = request_output
    
-        print(f"model output:\n {user_prompt} {outputs[0].outputs[0].text}")
+        print(f"model output:\n {user_prompt} {output.outputs[0].text}")
         user_prompt = input("Enter next prompt (press Enter to exit): ")
         if not user_prompt:
             break
 
 def run_script(
     model_name: str,
-    peft_model=None,
-    tp_size=1,
+    peft_model_name=None,
+    pp_size : int = 1,
+    tp_size : int = 1,
     max_new_tokens=100,
     user_prompt=None,
     top_p=0.9,
     temperature=0.8
 ):
-    model = load_model(model_name, tp_size)
-    main(model, max_new_tokens, user_prompt, top_p, temperature)
+    model = load_model(model_name, peft_model_name, pp_size, tp_size)
+
+    asyncio.get_event_loop().run_until_complete(main(model, peft_model_name, max_new_tokens, user_prompt, top_p, temperature))
 
 if __name__ == "__main__":
     fire.Fire(run_script)

+ 4 - 1
recipes/README.md

@@ -1,8 +1,11 @@
+## Llama-Recipes
+
 This folder contains examples organized by topic:
 
 | Subfolder | Description |
 |---|---|
 [quickstart](./quickstart)|The "Hello World" of using Llama 3, start here if you are new to using Llama 3
 [use_cases](./use_cases)|Scripts showing common applications of Llama 3
-[3p_integrations](./3p_integrations)|Partner owned folder showing common applications of Meta Llama3
+[3p_integrations](./3p_integrations)|Partner-owned folder showing Meta Llama 3 usage along with third-party tools 
 [responsible_ai](./responsible_ai)|Scripts to use PurpleLlama for safeguarding model outputs
+[experimental](./experimental)|Meta Llama implementations of experimental LLM techniques

+ 5 - 5
recipes/quickstart/README.md

@@ -2,11 +2,11 @@
 
 If you are new to developing with Meta Llama models, this is where you should start. This folder contains introductory-level notebooks across different techniques relating to Meta Llama.
 
-* The [](./Running_Llama3_Anywhere/) notebooks demonstrate how to run Llama inference across Linux, Mac and Windows platforms using the appropriate tooling.
-* The [](./Prompt_Engineering_with_Llama_3.ipynb) notebook showcases the various ways to elicit appropriate outputs from Llama. Take this notebook for a spin to get a feel for how Llama responds to different inputs and generation parameters.
-* The [](./inference/) folder contains scripts to deploy Llama for inference on server and mobile. See also [](../3p_integrations/vllm/) and [](../3p_integrations/tgi/) for hosting Llama on open-source model servers.
-* The [](./RAG/) folder contains a simple Retrieval-Augmented Generation application using Llama 3.
-* The [](./finetuning/) folder contains resources to help you finetune Llama 3 on your custom datasets, for both single- and multi-GPU setups. The scripts use the native llama-recipes finetuning code found in [](../../src/llama_recipes/finetuning.py) which supports these features:
+* The [Running_Llama3_Anywhere](./Running_Llama3_Anywhere/) notebooks demonstrate how to run Llama inference across Linux, Mac and Windows platforms using the appropriate tooling.
+* The [Prompt_Engineering_with_Llama_3](./Prompt_Engineering_with_Llama_3.ipynb) notebook showcases the various ways to elicit appropriate outputs from Llama. Take this notebook for a spin to get a feel for how Llama responds to different inputs and generation parameters.
+* The [inference](./inference/) folder contains scripts to deploy Llama for inference on server and mobile. See also [3p_integrations/vllm](../3p_integrations/vllm/) and [3p_integrations/tgi](../3p_integrations/tgi/) for hosting Llama on open-source model servers.
+* The [RAG](./RAG/) folder contains a simple Retrieval-Augmented Generation application using Llama 3.
+* The [finetuning](./finetuning/) folder contains resources to help you finetune Llama 3 on your custom datasets, for both single- and multi-GPU setups. The scripts use the native llama-recipes finetuning code found in [finetuning.py](../../src/llama_recipes/finetuning.py) which supports these features:
 
 | Feature                                        |   |
 | ---------------------------------------------- | - |

+ 2 - 2
recipes/quickstart/Running_Llama3_Anywhere/Running_Llama_on_HF_transformers.ipynb

@@ -92,7 +92,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Then, we will set the model variable to a specific model we’d like to use. In this demo, we will use the 8b chat model `meta-llama/Meta-Llama-3-8B-Instruct`. Using Meta models from Hugging Face requires you to\n",
+    "Then, we will set the model variable to a specific model we’d like to use. In this demo, we will use the 8b chat model `meta-llama/Meta-Llama-3.1-8B-Instruct`. Using Meta models from Hugging Face requires you to\n",
     "\n",
     "1. Accept Terms of Service for Meta Llama 3 on Meta [website](https://llama.meta.com/llama-downloads).\n",
     "2. Use the same email address from Step (1) to login into Hugging Face.\n",
@@ -125,7 +125,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n",
+    "model = \"meta-llama/Meta-Llama-3.1-8B-Instruct\"\n",
     "tokenizer = AutoTokenizer.from_pretrained(model)"
    ]
   },

+ 314 - 0
recipes/quickstart/agents/dlai/AI_Agentic_Design_Patterns_with_AutoGen_L4_Tool_Use_and_Conversational_Chess.ipynb

@@ -0,0 +1,314 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7a4b75bb-d60a-41e3-abca-1ca0f0bf1201",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/quickstart/agents/dlai/AI_Agentic_Design_Patterns_with_AutoGen_L4_Tool_Use_and_Conversational_Chess.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "51581f90-911f-46ef-82dd-f3ca9c1d4b96",
+   "metadata": {},
+   "source": [
+    "This notebook ports the DeepLearning.AI short course [AI Agentic Design Patterns with AutoGen Lesson 4 Tool Use and Conversational Chess](https://learn.deeplearning.ai/courses/ai-agentic-design-patterns-with-autogen/lesson/5/tool-use-and-conversational-chess) to using Llama 3. \n",
+    "\n",
+    "You should take the course before or after going through this notebook to have a deeper understanding."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f9824ea5-3791-4638-a09d-43eb2c906d79",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!pip install chess\n",
+    "!pip install pyautogen"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a082a6dc-ceb1-4a3e-b3ae-afcb835de6da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import chess\n",
+    "import chess.svg\n",
+    "from typing_extensions import Annotated"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fbcdd9ea-f589-463d-a306-3fb3fcde770c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "board = chess.Board()\n",
+    "\n",
+    "made_move = False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d27858c-4a0b-40f6-bd58-01b19c33ab38",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_legal_moves(\n",
+    "    \n",
+    ") -> Annotated[str, \"A list of legal moves in UCI format\"]:\n",
+    "    return \"Possible moves are: \" + \",\".join(\n",
+    "        [str(move) for move in board.legal_moves]\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "67742daa-9d9a-46b3-9466-beb96d535334",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import SVG\n",
+    "\n",
+    "def make_move(\n",
+    "    move: Annotated[str, \"A move in UCI format.\"]\n",
+    ") -> Annotated[str, \"Result of the move.\"]:\n",
+    "    move = chess.Move.from_uci(move)\n",
+    "    board.push_uci(str(move))\n",
+    "    global made_move\n",
+    "    made_move = True\n",
+    "    \n",
+    "    svg_str = chess.svg.board(\n",
+    "            board,\n",
+    "            arrows=[(move.from_square, move.to_square)],\n",
+    "            fill={move.from_square: \"gray\"},\n",
+    "            size=200\n",
+    "        )\n",
+    "    display(\n",
+    "        SVG(data=svg_str)\n",
+    "    )\n",
+    "    \n",
+    "    # Get the piece name.\n",
+    "    piece = board.piece_at(move.to_square)\n",
+    "    piece_symbol = piece.unicode_symbol()\n",
+    "    piece_name = (\n",
+    "        chess.piece_name(piece.piece_type).capitalize()\n",
+    "        if piece_symbol.isupper()\n",
+    "        else chess.piece_name(piece.piece_type)\n",
+    "    )\n",
+    "    return f\"Moved {piece_name} ({piece_symbol}) from \"\\\n",
+    "    f\"{chess.SQUARE_NAMES[move.from_square]} to \"\\\n",
+    "    f\"{chess.SQUARE_NAMES[move.to_square]}.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e84508c0-0465-4be8-a97b-2e702265bcfb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# base url from https://console.groq.com/docs/openai\n",
+    "config_list = [\n",
+    "    {\n",
+    "        \"model\": \"llama3-70b-8192\",\n",
+    "        \"base_url\": \"https://api.groq.com/openai/v1\",\n",
+    "        'api_key': 'your_groq_api_key', # get a free key at https://console.groq.com/keys\n",
+    "    },\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "86dbb782-61f0-4b61-aab5-41fd12c26f51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from autogen import ConversableAgent\n",
+    "\n",
+    "# Player white agent\n",
+    "player_white = ConversableAgent(\n",
+    "    name=\"Player White\",\n",
+    "    system_message=\"You are a chess player and you play as white. \"\n",
+    "    \"First call get_legal_moves(), to get a list of legal moves in UCI format. \"\n",
+    "    \"Then call make_move(move) to make a move. Finally, tell the proxy what you have moved and ask the black to move\", # added \"Finally...\" to make the agents work\n",
+    "    llm_config={\"config_list\": config_list,\n",
+    "                \"temperature\": 0,\n",
+    "               },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1c57411c-183a-44ea-95ab-33c0e97feb74",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Player black agent\n",
+    "player_black = ConversableAgent(\n",
+    "    name=\"Player Black\",\n",
+    "    system_message=\"You are a chess player and you play as black. \"\n",
+    "    \"First call get_legal_moves(), to get a list of legal moves in UCI format. \"\n",
+    "    \"Then call make_move(move) to make a move. Finally, tell the proxy what you have moved and ask the white to move\", # added \"Finally...\" to make the agents work\n",
+    "    llm_config={\"config_list\": config_list,\n",
+    "                \"temperature\": 0,\n",
+    "               },)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "60e5cb2d-4273-45a9-af40-0ffb1ada0009",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def check_made_move(msg):\n",
+    "    global made_move\n",
+    "    if made_move:\n",
+    "        made_move = False\n",
+    "        return True\n",
+    "    else:\n",
+    "        return False\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "be4c7b55-9d50-4aa8-ae4b-3b959ffbb298",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "board_proxy = ConversableAgent(\n",
+    "    name=\"Board Proxy\",\n",
+    "    llm_config=False,\n",
+    "    is_termination_msg=check_made_move,\n",
+    "    default_auto_reply=\"Please make a move.\",\n",
+    "    human_input_mode=\"NEVER\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e122875c-8bff-4212-8a1b-5f91d253fdd7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from autogen import register_function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20edcb8c-5b7b-438e-b476-1cb16d14ef62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for caller in [player_white, player_black]:\n",
+    "    register_function(\n",
+    "        get_legal_moves,\n",
+    "        caller=caller,\n",
+    "        executor=board_proxy,\n",
+    "        name=\"get_legal_moves\",\n",
+    "        description=\"Call this tool to get all legal moves in UCI format.\",\n",
+    "    )\n",
+    "    \n",
+    "    register_function(\n",
+    "        make_move,\n",
+    "        caller=caller,\n",
+    "        executor=board_proxy,\n",
+    "        name=\"make_move\",\n",
+    "        description=\"Call this tool to make a move.\",\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b254ea02-0a81-4e9f-91fa-788dead9ffb8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "player_black.llm_config[\"tools\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3715f56c-8ab8-4563-8f00-233beb3959b9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "player_white.register_nested_chats(\n",
+    "    trigger=player_black,\n",
+    "    chat_queue=[\n",
+    "        {\n",
+    "            \"sender\": board_proxy,\n",
+    "            \"recipient\": player_white,\n",
+    "            \"summary_method\": \"last_msg\",\n",
+    "        }\n",
+    "    ],\n",
+    ")\n",
+    "\n",
+    "player_black.register_nested_chats(\n",
+    "    trigger=player_white,\n",
+    "    chat_queue=[\n",
+    "        {\n",
+    "            \"sender\": board_proxy,\n",
+    "            \"recipient\": player_black,\n",
+    "            \"summary_method\": \"last_msg\",\n",
+    "        }\n",
+    "    ],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eda4f544-ab4c-4e9e-bceb-f93ad57c4026",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "board = chess.Board()\n",
+    "\n",
+    "chat_result = player_black.initiate_chat(\n",
+    "    player_white,\n",
+    "    message=\"Let's play chess! Your move.\",\n",
+    "    max_turns=3,\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 784 - 0
recipes/quickstart/agents/dlai/AI_Agents_in_LangGraph_L1_Build_an_Agent_from_Scratch.ipynb

@@ -0,0 +1,784 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "de56ee05-3b71-43c9-8cbf-6ad9b3233f38",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/quickstart/agents/dlai/AI_Agents_in_LangGraph_L1_Build_an_Agent_from_Scratch.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "16ba1896-6867-4c68-9951-b0aadb819782",
+   "metadata": {},
+   "source": [
+    "This notebook ports the DeepLearning.AI short course [AI Agents in LangGraph Lesson 1 Build an Agent from Scratch](https://learn.deeplearning.ai/courses/ai-agents-in-langgraph/lesson/2/build-an-agent-from-scratch) to using Llama 3, with a bonus section that ports the agent from scratch code to using LangGraph, introduced in [Lession 2 LangGraph Components](https://learn.deeplearning.ai/courses/ai-agents-in-langgraph/lesson/3/langgraph-components) of the course. \n",
+    "\n",
+    "You should take the course, especially the first two lessons, before or after going through this notebook, to have a deeper understanding."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9b168b57-6ff8-41d1-8f8f-a0c4a5ff108e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install groq"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c067d5f-c58c-47c0-8ccd-9a8710711bf7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os \n",
+    "from groq import Groq\n",
+    "\n",
+    "os.environ['GROQ_API_KEY'] = 'your_groq_api_key' # get a free key at https://console.groq.com/keys"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f7d8d95-36fb-4b14-bd28-99d305c0fded",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# a quick sanity test of calling Llama 3 70b on Groq \n",
+    "# see https://console.groq.com/docs/text-chat for more info\n",
+    "client = Groq()\n",
+    "chat_completion = client.chat.completions.create(\n",
+    "    model=\"llama3-70b-8192\",\n",
+    "    messages=[{\"role\": \"user\", \"content\": \"what are the words Charlotte wrote for the pig?\"}]\n",
+    ")\n",
+    "print(chat_completion.choices[0].message.content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f758c771-5afe-4008-9d7f-92a6f526778b",
+   "metadata": {},
+   "source": [
+    "### ReAct Agent from Sractch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c00c0479-0913-4a92-8991-fe5a9a936bdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = Groq()\n",
+    "model = \"llama3-8b-8192\" # this model works with the prompt below only for the first simpler example; you'll see how to modify the prompt to make it work for a more complicated question\n",
+    "#model = \"llama3-70b-8192\" # this model works with the prompt below for both example questions \n",
+    "\n",
+    "class Agent:\n",
+    "    def __init__(self, system=\"\"):\n",
+    "        self.system = system\n",
+    "        self.messages = []\n",
+    "        if self.system:\n",
+    "            self.messages.append({\"role\": \"system\", \"content\": system})\n",
+    "\n",
+    "    def __call__(self, message):\n",
+    "        self.messages.append({\"role\": \"user\", \"content\": message})\n",
+    "        result = self.execute()\n",
+    "        self.messages.append({\"role\": \"assistant\", \"content\": result})\n",
+    "        return result\n",
+    "\n",
+    "    def execute(self):\n",
+    "        completion = client.chat.completions.create(\n",
+    "                        model=model,\n",
+    "                        temperature=0,\n",
+    "                        messages=self.messages)\n",
+    "        return completion.choices[0].message.content\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f766fb44-e8c2-43db-af83-8b9053a334ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = \"\"\"\n",
+    "You run in a loop of Thought, Action, PAUSE, Observation.\n",
+    "At the end of the loop you output an Answer\n",
+    "Use Thought to describe your thoughts about the question you have been asked.\n",
+    "Use Action to run one of the actions available to you - then return PAUSE.\n",
+    "Observation will be the result of running those actions.\n",
+    "\n",
+    "Your available actions are:\n",
+    "\n",
+    "calculate:\n",
+    "e.g. calculate: 4 * 7 / 3\n",
+    "Runs a calculation and returns the number - uses Python so be sure to use floating point syntax if necessary\n",
+    "\n",
+    "average_dog_weight:\n",
+    "e.g. average_dog_weight: Collie\n",
+    "returns average weight of a dog when given the breed\n",
+    "\n",
+    "Example session:\n",
+    "\n",
+    "Question: How much does a Bulldog weigh?\n",
+    "Thought: I should look the dogs weight using average_dog_weight\n",
+    "Action: average_dog_weight: Bulldog\n",
+    "PAUSE\n",
+    "\n",
+    "You will be called again with this:\n",
+    "\n",
+    "Observation: A Bulldog weights 51 lbs\n",
+    "\n",
+    "You then output:\n",
+    "\n",
+    "Answer: A bulldog weights 51 lbs\n",
+    "\"\"\".strip()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "93ab1290-625b-4b69-be4d-210fca43a513",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def calculate(what):\n",
+    "    return eval(what)\n",
+    "\n",
+    "def average_dog_weight(name):\n",
+    "    if name in \"Scottish Terrier\": \n",
+    "        return(\"Scottish Terriers average 20 lbs\")\n",
+    "    elif name in \"Border Collie\":\n",
+    "        return(\"a Border Collies average weight is 37 lbs\")\n",
+    "    elif name in \"Toy Poodle\":\n",
+    "        return(\"a toy poodles average weight is 7 lbs\")\n",
+    "    else:\n",
+    "        return(\"An average dog weights 50 lbs\")\n",
+    "\n",
+    "known_actions = {\n",
+    "    \"calculate\": calculate,\n",
+    "    \"average_dog_weight\": average_dog_weight\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "52f900d9-15f0-4f48-9bf3-6165c70e4b42",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abot = Agent(prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f1b612c9-2a7d-4325-b36f-182899252538",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = abot(\"How much does a toy poodle weigh?\")\n",
+    "print(result)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e27dda33-c76d-4a19-8aef-02ba5389e7a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abot.messages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b6e85ca1-85af-43e3-a5ea-c5faf0935361",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# manually call the exeternal func (tool) for now\n",
+    "result = average_dog_weight(\"Toy Poodle\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9263ac7-fa81-4c95-91c8-a6c0741ab7f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb309710-0693-422f-a739-38ca9455e497",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "next_prompt = \"Observation: {}\".format(result)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd567e42-b5a9-4e4e-8807-38bb1d6c80a4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abot(next_prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "255bf148-bf85-40c5-b33e-d849a42c127b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abot.messages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c286a6d5-b5b3-473b-bad6-aa6f1468e603",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abot = Agent(prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f5e13b6e-e68e-45c2-b688-a257b531e482",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "question = \"\"\"I have 2 dogs, a border collie and a scottish terrier. \\\n",
+    "What is their combined weight\"\"\"\n",
+    "abot(question)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "049202f1-585f-42c3-8511-08eca7e5ed0e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abot.messages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f086f19a-30fe-40ca-aafb-f1ce7c28982d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "next_prompt = \"Observation: {}\".format(average_dog_weight(\"Border Collie\"))\n",
+    "print(next_prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1747c78d-642d-4f57-81a0-27218eab3958",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abot(next_prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "85809d8f-cd95-4e0a-acb7-9705817bea70",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abot.messages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e77591fa-4e04-4eb6-8a40-ca26a71765f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "next_prompt = \"Observation: {}\".format(average_dog_weight(\"Scottish Terrier\"))\n",
+    "print(next_prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f72b638-de07-4972-bbdb-8c8602f3d143",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abot(next_prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eb5bf29d-22f9-4c0d-aea6-7e9c99e71835",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abot.messages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a67add73-b3c3-42be-9c54-f8a6ac828869",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "next_prompt = \"Observation: {}\".format(eval(\"37 + 20\"))\n",
+    "print(next_prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "801fda04-9756-4ae4-9990-559216d38be8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abot(next_prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "56f7b9f4-289f-498d-8bc8-da9bb7365d52",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abot.messages"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "155ee9b3-a4f9-43dd-b23e-0f268f72f198",
+   "metadata": {},
+   "source": [
+    "### Automate the ReAct action execution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5b2196f8-88e6-4eb4-82b0-cf251a07e313",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "# automate the action execution above to make the whole ReAct (Thought - Action- Observsation) process fully automated\n",
+    "action_re = re.compile('^Action: (\\w+): (.*)$')   # python regular expression to selection action"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea5710d6-5d9a-46ff-a275-46311257d9fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def query(question, max_turns=5):\n",
+    "    i = 0\n",
+    "    bot = Agent(prompt) # set system prompt\n",
+    "    next_prompt = question\n",
+    "    while i < max_turns:\n",
+    "        i += 1\n",
+    "        result = bot(next_prompt)\n",
+    "        print(result)\n",
+    "        actions = [\n",
+    "            action_re.match(a)\n",
+    "            for a in result.split('\\n')\n",
+    "            if action_re.match(a)\n",
+    "        ]\n",
+    "        if actions:\n",
+    "            # There is an action to run\n",
+    "            action, action_input = actions[0].groups()\n",
+    "            if action not in known_actions:\n",
+    "                raise Exception(\"Unknown action: {}: {}\".format(action, action_input))\n",
+    "            print(\" -- running {} {}\".format(action, action_input))\n",
+    "\n",
+    "            # key to make the agent process fully automated:\n",
+    "            # programtically call the external func with arguments, with the info returned by LLM\n",
+    "            observation = known_actions[action](action_input) \n",
+    "\n",
+    "            print(\"Observation:\", observation)\n",
+    "            next_prompt = \"Observation: {}\".format(observation)\n",
+    "        else:\n",
+    "            return"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "232d0818-7580-424b-9538-1e2b1c15360b",
+   "metadata": {},
+   "source": [
+    "#### Using model \"llama3-8b-8192\", the code below will cause an invalid syntax error because the Action returned is calculate: (average_dog_weight: Border Collie) + (average_dog_weight: Scottish Terrier), instead of the expected \"Action: average_dog_weight: Border Collie\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bb0095f3-b3f1-48cf-b3fb-36049b6b91f0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "question = \"\"\"I have 2 dogs, a border collie and a scottish terrier. \\\n",
+    "What is their combined weight\"\"\"\n",
+    "query(question)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "952ffac8-5ec2-48f3-8049-d03c130dad0d",
+   "metadata": {},
+   "source": [
+    "#### Prompt engineering in action:\n",
+    "REPLACE \"Use Thought to describe your thoughts about the question you have been asked. Use Action to run one of the actions available to you - then return PAUSE.\" with \n",
+    "\"First, use Thought to describe your thoughts about the question you have been asked, and generate Action to run one of the actions available to you, then return PAUSE.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec791ad6-b39a-4f46-b149-704c23d6c506",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = \"\"\"\n",
+    "You run in a loop of Thought, Action, PAUSE, Observation.\n",
+    "At the end of the loop you output an Answer.\n",
+    "First, use Thought to describe your thoughts about the question you have been asked, and generate Action to run one of the actions available to you, then return PAUSE.\n",
+    "Observation will be the result of running those actions.\n",
+    "\n",
+    "Your available actions are:\n",
+    "\n",
+    "calculate:\n",
+    "e.g. calculate: 4 * 7 / 3\n",
+    "Runs a calculation and returns the number - uses Python so be sure to use floating point syntax if necessary\n",
+    "\n",
+    "average_dog_weight:\n",
+    "e.g. average_dog_weight: Collie\n",
+    "returns average weight of a dog when given the breed\n",
+    "\n",
+    "Example session:\n",
+    "\n",
+    "Question: How much does a Bulldog weigh?\n",
+    "Thought: I should look the dogs weight using average_dog_weight\n",
+    "Action: average_dog_weight: Bulldog\n",
+    "PAUSE\n",
+    "\n",
+    "You will be called again with this:\n",
+    "\n",
+    "Observation: A Bulldog weights 51 lbs\n",
+    "\n",
+    "You then output:\n",
+    "\n",
+    "Answer: A bulldog weights 51 lbs\n",
+    "\"\"\".strip()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90bcf731-4d89-473b-98e1-53826da149f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "question = \"\"\"I have 2 dogs, a border collie and a scottish terrier. \\\n",
+    "What is their combined weight\"\"\"\n",
+    "query(question)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "09d30a8f-3783-4ee5-a48e-7d89e22a508a",
+   "metadata": {},
+   "source": [
+    "### Bonus: Port the Agent Implementation to LangGraph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6b5ed82e-2d70-45ac-b026-904da211f81a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install langchain\n",
+    "!pip install langgraph\n",
+    "!pip install langchain_openai\n",
+    "!pip install langchain_community\n",
+    "!pip install httpx\n",
+    "!pip install langchain-groq"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a8ed3b90-688e-4aa2-8e43-e951af29a57f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langgraph.graph import StateGraph, END\n",
+    "from typing import TypedDict, Annotated\n",
+    "import operator\n",
+    "from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, ToolMessage\n",
+    "from langchain_openai import ChatOpenAI\n",
+    "from langchain_community.tools.tavily_search import TavilySearchResults"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c555f945-7db0-4dc9-9ea5-5632bf941fe4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_groq import ChatGroq\n",
+    "\n",
+    "model = ChatGroq(temperature=0, model_name=\"llama3-8b-8192\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7755a055-fa1f-474f-b558-230cc5a67a33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.tools import tool\n",
+    "from langgraph.prebuilt import ToolNode\n",
+    "\n",
+    "@tool\n",
+    "def calculate(what):\n",
+    "    \"\"\"Runs a calculation and returns the number.\"\"\"\n",
+    "    return eval(what)\n",
+    "\n",
+    "@tool\n",
+    "def average_dog_weight(name):\n",
+    "    \"\"\"Returns the average weight of a dog.\"\"\"\n",
+    "    if name in \"Scottish Terrier\":\n",
+    "        return(\"Scottish Terriers average 20 lbs\")\n",
+    "    elif name in \"Border Collie\":\n",
+    "        return(\"a Border Collies average weight is 37 lbs\")\n",
+    "    elif name in \"Toy Poodle\":\n",
+    "        return(\"a toy poodles average weight is 7 lbs\")\n",
+    "    else:\n",
+    "        return(\"An average dog weights 50 lbs\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4a003862-8fd2-45b1-8fe4-78d7cd5888d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = \"\"\"\n",
+    "You run in a loop of Thought, Action, Observation.\n",
+    "At the end of the loop you output an Answer\n",
+    "Use Thought to describe your thoughts about the question you have been asked.\n",
+    "Use Action to run one of the actions available to you.\n",
+    "Observation will be the result of running those actions.\n",
+    "\n",
+    "Your available actions are:\n",
+    "\n",
+    "calculate:\n",
+    "e.g. calculate: 4 * 7 / 3\n",
+    "Runs a calculation and returns the number - uses Python so be sure to use floating point syntax if necessary\n",
+    "\n",
+    "average_dog_weight:\n",
+    "e.g. average_dog_weight: Collie\n",
+    "returns average weight of a dog when given the breed\n",
+    "\n",
+    "Example session:\n",
+    "\n",
+    "Question: How much does a Bulldog weigh?\n",
+    "Thought: I should look the dogs weight using average_dog_weight\n",
+    "Action: average_dog_weight: Bulldog\n",
+    "\n",
+    "You will be called again with this:\n",
+    "\n",
+    "Observation: A Bulldog weights 51 lbs\n",
+    "\n",
+    "You then output:\n",
+    "\n",
+    "Answer: A bulldog weights 51 lbs\n",
+    "\"\"\".strip()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "471c0aa7-547f-4d5f-9e99-73ef47101d41",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class AgentState(TypedDict):\n",
+    "    messages: Annotated[list[AnyMessage], operator.add]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "530e8a60-085a-4485-af03-bafc6b2c1d88",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Agent:\n",
+    "\n",
+    "    def __init__(self, model, tools, system=\"\"):\n",
+    "        self.system = system\n",
+    "        graph = StateGraph(AgentState)\n",
+    "        graph.add_node(\"llm\", self.call_llm)\n",
+    "        graph.add_node(\"action\", self.take_action)\n",
+    "        graph.add_conditional_edges(\n",
+    "            \"llm\",\n",
+    "            self.exists_action,\n",
+    "            {True: \"action\", False: END}\n",
+    "        )\n",
+    "        graph.add_edge(\"action\", \"llm\")\n",
+    "        graph.set_entry_point(\"llm\")\n",
+    "        self.graph = graph.compile()\n",
+    "        self.tools = {t.name: t for t in tools}\n",
+    "        self.model = model.bind_tools(tools)\n",
+    "\n",
+    "    def exists_action(self, state: AgentState):\n",
+    "        result = state['messages'][-1]\n",
+    "        return len(result.tool_calls) > 0\n",
+    "\n",
+    "    def call_llm(self, state: AgentState):\n",
+    "        messages = state['messages']\n",
+    "        if self.system:\n",
+    "            messages = [SystemMessage(content=self.system)] + messages\n",
+    "        message = self.model.invoke(messages)\n",
+    "        return {'messages': [message]}\n",
+    "\n",
+    "    def take_action(self, state: AgentState):\n",
+    "        tool_calls = state['messages'][-1].tool_calls\n",
+    "        results = []\n",
+    "        for t in tool_calls:\n",
+    "            print(f\"Calling: {t}\")\n",
+    "            result = self.tools[t['name']].invoke(t['args'])\n",
+    "            results.append(ToolMessage(tool_call_id=t['id'], name=t['name'], content=str(result)))\n",
+    "        print(\"Back to the model!\")\n",
+    "        return {'messages': results}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3db8dcea-d4eb-46df-bd90-55acd4c5520a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "abot = Agent(model, [calculate, average_dog_weight], system=prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72c62e36-9321-40d2-86d8-b3c9caf3020f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [HumanMessage(content=\"How much does a Toy Poodle weigh?\")]\n",
+    "result = abot.graph.invoke({\"messages\": messages})\n",
+    "result['messages'], result['messages'][-1].content\n",
+    "\n",
+    "# the code above will cause an error because Llama 3 8B incorrectly returns an extra \"calculate\" tool call"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "56b4c622-b306-4aa3-84e6-4ccd6d6f272f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# using the Llama 3 70B will fix the error\n",
+    "model = ChatGroq(temperature=0, model_name=\"llama3-70b-8192\")\n",
+    "abot = Agent(model, [calculate, average_dog_weight], system=prompt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "629ca375-979a-45d7-bad8-7240ae9ad844",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Toy Poodle case sensitive here - can be fixed easily by modifying def average_dog_weight\n",
+    "messages = [HumanMessage(content=\"How much does a Toy Poodle weigh?\")]\n",
+    "result = abot.graph.invoke({\"messages\": messages})\n",
+    "result['messages'], result['messages'][-1].content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "30e253ae-e742-4df8-92e6-fadfc3826003",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [HumanMessage(content=\"I have 2 dogs, a border collie and a scottish terrier. What are their average weights? Total weight?\")]\n",
+    "result = abot.graph.invoke({\"messages\": messages})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "238ec75c-4ff6-4561-bb0a-895530a61e47",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result['messages'], result['messages'][-1].content"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 355 - 0
recipes/quickstart/agents/dlai/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb

@@ -0,0 +1,355 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/quickstart/agents/dlai/Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook ports the DeepLearning.AI short course [Building Agentic RAG with Llamaindex Lesson 1 Router Engine](https://learn.deeplearning.ai/courses/building-agentic-rag-with-llamaindex/lesson/2/router-query-engine) to using Llama 3. \n",
+    "\n",
+    "You should take the course before or after going through this notebook to have a deeper understanding."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "eiJsOa29ej7G",
+    "outputId": "094c60c2-d782-4baf-bfc3-913f53ac1ff3"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install llama-index\n",
+    "!pip install llama-index-embeddings-huggingface\n",
+    "!pip install llama-index-llms-groq"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "Wv_-jm5XGoWo",
+    "outputId": "d0dd37f4-ca33-4704-d221-6ea80ee09eb5"
+   },
+   "outputs": [],
+   "source": [
+    "import os \n",
+    "os.environ['GROQ_API_KEY'] = 'your_groq_api_key' # get a free key at https://console.groq.com/keys"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "aUi629qVUsG5",
+    "outputId": "015a6a1a-a9e1-4c4f-dcaf-7d341b1a0b0e"
+   },
+   "outputs": [],
+   "source": [
+    "!wget \"https://openreview.net/pdf?id=VtmBAGCN7o\" -O metagpt.pdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "auZQalH5J7CU"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core import SimpleDirectoryReader\n",
+    "\n",
+    "documents = SimpleDirectoryReader(input_files=[\"metagpt.pdf\"]).load_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "GFfUjJypJ7Eq"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core.node_parser import SentenceSplitter\n",
+    "\n",
+    "splitter = SentenceSplitter(chunk_size=1024)\n",
+    "nodes = splitter.get_nodes_from_documents(documents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "bguUp2D5LhST"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.llms.groq import Groq\n",
+    "\n",
+    "from llama_index.core import Settings, VectorStoreIndex\n",
+    "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
+    "\n",
+    "llm = Groq(model=\"llama3-8b-8192\") #, api_key=GROQ_API_TOKEN)\n",
+    "Settings.llm = llm\n",
+    "#llm.complete(\"Who wrote the book godfather\").text\n",
+    "\n",
+    "Settings.embed_model = HuggingFaceEmbedding(\n",
+    "    model_name=\"BAAI/bge-small-en-v1.5\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "pkmnSHsMJ7Hg"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core import SummaryIndex, VectorStoreIndex\n",
+    "\n",
+    "summary_index = SummaryIndex(nodes)\n",
+    "vector_index = VectorStoreIndex(nodes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "BnMq-qO9ezcE"
+   },
+   "outputs": [],
+   "source": [
+    "summary_query_engine = summary_index.as_query_engine(\n",
+    "    response_mode=\"tree_summarize\",\n",
+    "    use_async=True,\n",
+    ")\n",
+    "vector_query_engine = vector_index.as_query_engine()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "vnK7YXlVLojh"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core.tools import QueryEngineTool\n",
+    "\n",
+    "summary_tool = QueryEngineTool.from_defaults(\n",
+    "    query_engine=summary_query_engine,\n",
+    "    description=(\n",
+    "        \"Useful for summarization questions related to MetaGPT\"\n",
+    "    ),\n",
+    ")\n",
+    "\n",
+    "vector_tool = QueryEngineTool.from_defaults(\n",
+    "    query_engine=vector_query_engine,\n",
+    "    description=(\n",
+    "        \"Useful for retrieving specific context from the MetaGPT paper.\"\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "EvLJilU4LomU"
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core.query_engine.router_query_engine import RouterQueryEngine\n",
+    "from llama_index.core.selectors import LLMSingleSelector\n",
+    "\n",
+    "query_engine = RouterQueryEngine(\n",
+    "    selector=LLMSingleSelector.from_defaults(),\n",
+    "    query_engine_tools=[\n",
+    "        summary_tool,\n",
+    "        vector_tool,\n",
+    "    ],\n",
+    "    verbose=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "O_HfeD6TMCJf"
+   },
+   "outputs": [],
+   "source": [
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "KAjQW6M1LopK",
+    "outputId": "fd85f083-599f-40b0-8fed-c3c96b09c5c9"
+   },
+   "outputs": [],
+   "source": [
+    "response = query_engine.query(\"What is the summary of the document?\")\n",
+    "print(str(response))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "ySNeKv3PLosR",
+    "outputId": "9568d416-9ec1-4f50-dbbf-61a208205113"
+   },
+   "outputs": [],
+   "source": [
+    "print(len(response.source_nodes))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "KOP52yxpLou-",
+    "outputId": "168a6ff3-2a2a-4588-dc2f-978ae8c0bbc1"
+   },
+   "outputs": [],
+   "source": [
+    "response = query_engine.query(\n",
+    "    \"How do agents share information with other agents? This is not a summarization question.\"\n",
+    ")\n",
+    "print(str(response))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "luGziGzfMHbD"
+   },
+   "outputs": [],
+   "source": [
+    "def get_router_query_engine(file_path: str):\n",
+    "    \"\"\"Get router query engine.\"\"\"\n",
+    "\n",
+    "    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()\n",
+    "\n",
+    "    splitter = SentenceSplitter(chunk_size=1024)\n",
+    "    nodes = splitter.get_nodes_from_documents(documents)\n",
+    "\n",
+    "    summary_index = SummaryIndex(nodes)\n",
+    "    vector_index = VectorStoreIndex(nodes)\n",
+    "\n",
+    "    summary_query_engine = summary_index.as_query_engine(\n",
+    "        response_mode=\"tree_summarize\",\n",
+    "        use_async=True,\n",
+    "    )\n",
+    "    vector_query_engine = vector_index.as_query_engine()\n",
+    "\n",
+    "    summary_tool = QueryEngineTool.from_defaults(\n",
+    "        query_engine=summary_query_engine,\n",
+    "        description=(\n",
+    "            \"Useful for summarization questions related to MetaGPT\"\n",
+    "        ),\n",
+    "    )\n",
+    "\n",
+    "    vector_tool = QueryEngineTool.from_defaults(\n",
+    "        query_engine=vector_query_engine,\n",
+    "        description=(\n",
+    "            \"Useful for retrieving specific context from the MetaGPT paper.\"\n",
+    "        ),\n",
+    "    )\n",
+    "\n",
+    "    query_engine = RouterQueryEngine(\n",
+    "        selector=LLMSingleSelector.from_defaults(),\n",
+    "        query_engine_tools=[\n",
+    "            summary_tool,\n",
+    "            vector_tool,\n",
+    "        ],\n",
+    "        verbose=True\n",
+    "    )\n",
+    "    return query_engine\n",
+    "\n",
+    "query_engine = get_router_query_engine(\"metagpt.pdf\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "SHqbTBxQNryl",
+    "outputId": "ca2f16e9-2274-42e6-fdd5-3218d8f0c3f8"
+   },
+   "outputs": [],
+   "source": [
+    "response = query_engine.query(\"Tell me about the ablation study results?\")\n",
+    "print(str(response))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "5r1MHbLOPT8Y"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

+ 581 - 0
recipes/quickstart/agents/dlai/Functions_Tools_and_Agents_with_LangChain_L1_Function_Calling.ipynb

@@ -0,0 +1,581 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "2ba1b4ef-3b96-4e7e-b5d0-155b839db73c",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/meta-llama/llama-recipes/blob/main/recipes/quickstart/agents/dlai/Functions_Tools_and_Agents_with_LangChain_L1_Function_Calling.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f91905c8-21ca-4d81-9614-b9c7344d08c3",
+   "metadata": {},
+   "source": [
+    "This notebook ports the DeepLearning.AI short course [Functions, Tools and Agents with LangChain Lesson 1 OpenAI Function Calling](https://learn.deeplearning.ai/courses/functions-tools-agents-langchain/lesson/2/openai-function-calling) to using Llama 3. \n",
+    "\n",
+    "You should take the course before or after going through this notebook to have a deeper understanding."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "31bfe801-e24d-459b-8b3f-e91a34024368",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install groq"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "88659373-0deb-45eb-8934-0b02d70bd047",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "# Example dummy function hard coded to return the same weather\n",
+    "# In production, this could be your backend API or an external API\n",
+    "def get_current_weather(location, unit=\"fahrenheit\"):\n",
+    "    \"\"\"Get the current weather in a given location\"\"\"\n",
+    "    weather_info = {\n",
+    "        \"location\": location,\n",
+    "        \"temperature\": \"72\",\n",
+    "        \"unit\": unit,\n",
+    "        \"forecast\": [\"sunny\", \"windy\"],\n",
+    "    }\n",
+    "    return json.dumps(weather_info)\n",
+    "\n",
+    "known_functions = {\n",
+    "    \"get_current_weather\": get_current_weather\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "359a584a-5b26-4497-afb4-72b63027edb9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# https://console.groq.com/docs/tool-use#models\n",
+    "# Groq API endpoints support tool use for programmatic execution of specified operations through requests with explicitly defined \n",
+    "# operations. With tool use, Groq API model endpoints deliver structured JSON output that can be used to directly invoke functions.\n",
+    "\n",
+    "from groq import Groq\n",
+    "import os\n",
+    "import json\n",
+    "\n",
+    "client = Groq(api_key = 'your_groq_api_key' # get a free key at https://console.groq.com/keys')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5cc17dc9-2827-4d39-a13d-a4ed5f53c8e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "functions = [\n",
+    "    {\n",
+    "        \"name\": \"get_current_weather\",\n",
+    "        \"description\": \"Get the current weather in a given location\",\n",
+    "        \"parameters\": {\n",
+    "            \"type\": \"object\",\n",
+    "            \"properties\": {\n",
+    "                \"location\": {\n",
+    "                    \"type\": \"string\",\n",
+    "                    \"description\": \"The city and state, e.g. San Francisco, CA\",\n",
+    "                },\n",
+    "                \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]},\n",
+    "            },\n",
+    "            \"required\": [\"location\"],\n",
+    "        },\n",
+    "    }\n",
+    "]\n",
+    "\n",
+    "tools = [\n",
+    "    {\n",
+    "        \"type\": \"function\",\n",
+    "        \"function\": {\n",
+    "            \"name\": \"get_current_weather\",\n",
+    "            \"description\": \"Get the current weather in a given location\",\n",
+    "            \"parameters\": {\n",
+    "                \"type\": \"object\",\n",
+    "                \"properties\": {\n",
+    "                    \"location\": {\n",
+    "                        \"type\": \"string\",\n",
+    "                        \"description\": \"The city and state, e.g. San Francisco, CA\",\n",
+    "                    },\n",
+    "                    \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]},\n",
+    "                },\n",
+    "                \"required\": [\"location\"],\n",
+    "            },\n",
+    "        }\n",
+    "    }\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a64d28e-b169-4855-b3c2-d6722c56394c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": \"What's the weather like in Boston?\"\n",
+    "    }\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a733c1e1-c7f2-4707-b1be-02179df0abc6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = client.chat.completions.create(\n",
+    "    model=\"llama3-70b-8192\",\n",
+    "    messages=messages,\n",
+    "    functions=functions,\n",
+    "    #tools=tools, # you can also replace functions with tools, as specified in https://console.groq.com/docs/tool-use \n",
+    "    max_tokens=4096, \n",
+    "    temperature=0\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9058073d-cf91-4747-9860-7e2a1d774acf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ffd4ed64-0436-499e-a7e5-4224833b72f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response_message = response.choices[0].message\n",
+    "response_message"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5458444a-a448-4c5b-b06c-47ab6cd25626",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response_message.content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c669a048-1a3e-43e9-b98f-d0b6a3a0f4c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response_message.function_call"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27f3de5d-5110-486e-8b07-5086939d364d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "json.loads(response_message.function_call.arguments)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b69e6497-9e68-47d4-99ae-d45db6c1a8db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "args = json.loads(response_message.function_call.arguments)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f41a7162-9ce8-4353-827b-f6f3bb278218",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "get_current_weather(args)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bb0546f2-de55-417a-9b38-66787b673fb7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "function_call = response.choices[0].message.function_call\n",
+    "function_call"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0dd1fcf0-7105-4cad-82b5-22ce3b24fc07",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "function_call.name, function_call.arguments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6d58efe-0ada-48a2-b12b-6bff948a2983",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# by defining and using known_functions, we can programatically call function\n",
+    "function_response = known_functions[function_call.name](function_call.arguments)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cee6ca19-6924-4a7b-ba7f-7b1a33344ca0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "function_response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8480be29-3326-4d95-8742-dff976a7ab2e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# add the message returned by tool and query LLM again to get final answer\n",
+    "messages.append(\n",
+    "{\n",
+    "    \"role\": \"function\",\n",
+    "    \"name\": function_call.name,\n",
+    "    \"content\": function_response,\n",
+    "}\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4a42e35f-c601-4c14-8de5-bdbba01dc622",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f9a2d1ee-9e41-480a-a5cc-62c273d3a179",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = client.chat.completions.create(\n",
+    "    model=\"llama3-70b-8192\",\n",
+    "    messages=messages,\n",
+    "    temperature=0\n",
+    ")\n",
+    "\n",
+    "response.choices[0].message.content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54019c56-11cf-465a-a440-296081adee93",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": \"hi!\",\n",
+    "    }\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "922724ec-1744-4ccf-9a86-5f1823dce0e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = client.chat.completions.create(\n",
+    "    model=\"llama3-70b-8192\",\n",
+    "    messages=messages,\n",
+    "    functions=functions,\n",
+    "    function_call=\"none\", # default is auto (let LLM decide if using function call or not. can also be none, or a dict {{\"name\": \"func_name\"}\n",
+    "    temperature=0\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "04c3152a-f51b-45cb-a27c-0672337520b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "582fac7c-0de7-420c-8150-038e74be4b9a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response_message = response.choices[0].message\n",
+    "response_message"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e3d62357-04c9-459c-b36a-89e58444ea63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": \"hi!\",\n",
+    "    }\n",
+    "]\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"llama3-70b-8192\",\n",
+    "    messages=messages,\n",
+    "    functions=functions,\n",
+    "    function_call=\"auto\", # default is auto (let LLM decide if using function call or not. can also be none, or a dict {{\"name\": \"func_name\"}\n",
+    "    temperature=0\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "632df69d-85bc-4e44-814c-7c1d2fe97228",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": \"hi!\",\n",
+    "    }\n",
+    "]\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"llama3-70b-8192\",\n",
+    "    messages=messages,\n",
+    "    functions=functions,\n",
+    "    function_call=\"none\", # default is auto (let LLM decide if using function call or not. can also be none, or a dict {{\"name\": \"func_name\"}\n",
+    "    temperature=0\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c773ab17-a620-44eb-877f-9e0bc23fb00b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": \"What's the weather in Boston?\",\n",
+    "    }\n",
+    "]\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"llama3-70b-8192\",\n",
+    "    messages=messages,\n",
+    "    functions=functions,\n",
+    "    function_call=\"none\", # default is auto (let LLM decide if using function call or not. can also be none, or a dict {{\"name\": \"func_name\"}\n",
+    "    temperature=0\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c4a8ee80-83ae-4189-837c-54bb9c93c315",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": \"hi!\",\n",
+    "    }\n",
+    "]\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"llama3-70b-8192\",\n",
+    "    messages=messages,\n",
+    "    functions=functions,\n",
+    "    function_call={\"name\": \"get_current_weather\"}, # default is auto (let LLM decide if using function call or not. can also be none, or a dict {{\"name\": \"func_name\"}\n",
+    "    temperature=0\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "daa5801a-2e71-4630-a8cd-7e84d1214f51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": \"What's the weather like in Boston!\",\n",
+    "    }\n",
+    "]\n",
+    "response = client.chat.completions.create(\n",
+    "    model=\"llama3-70b-8192\",\n",
+    "    messages=messages,\n",
+    "    functions=functions,\n",
+    "    function_call={\"name\": \"get_current_weather\"}, # default is auto (let LLM decide if using function call or not. can also be none, or a dict {{\"name\": \"func_name\"}\n",
+    "    temperature=0\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de5924d4-4225-48d1-a390-e44f3167d547",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "function_call = response.choices[0].message.function_call\n",
+    "function_call.name, function_call.arguments"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb9f3340-b905-47f3-a478-cf3d786faa1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "args = json.loads(response.choices[0].message.function_call.arguments)\n",
+    "observation = known_functions[function_call.name](args)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c31e9b5-99ed-46f3-8849-133c71ea87d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "observation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b73c550e-5aa2-49de-8422-0c3e706f1df4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages.append(\n",
+    "        {\n",
+    "            \"role\": \"function\",\n",
+    "            \"name\": function_call.name,\n",
+    "            \"content\": observation,\n",
+    "        }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c60302f1-07e2-4f22-bd60-b54e1ea2e3db",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a35f7f3d-4e39-4744-b5e3-2065e67eea28",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = client.chat.completions.create(\n",
+    "    model=\"llama3-70b-8192\",\n",
+    "    messages=messages,\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7d4745e1-0477-4b6b-84de-9c82e0bc2452",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response.choices[0].message.content"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 11 - 0
recipes/quickstart/agents/dlai/README.md

@@ -0,0 +1,11 @@
+# Quickstart Llama 3 Powered Agent Examples Ported from DeepLearning.ai Short Courses
+
+The notebooks in this folder are ported from the 4 recent agent short courses on [Deeplearning.ai](https://www.deeplearning.ai) to use Llama 3 to build agent apps from scratch or with open source frameworks (LangChain, LlamaIndex, AutoGen).
+
+1. [Functions Tools and Agents with LangChain L1 Function Calling](Functions_Tools_and_Agents_with_LangChain_L1_Function_Calling.ipynb)
+
+2. [AI Agents in LangGraph L1 Build an Agent from Scratch](AI_Agents_in_LangGraph_L1_Build_an_Agent_from_Scratch.ipynb)
+
+3. [Building Agentic RAG with Llamaindex L1 Router Engine](Building_Agentic_RAG_with_Llamaindex_L1_Router_Engine.ipynb)
+
+4. [AI Agentic Design Patterns with AutoGen L4 Tool Use and Conversational Chess](AI_Agentic_Design_Patterns_with_AutoGen_L4_Tool_Use_and_Conversational_Chess.ipynb)

+ 4 - 6
recipes/quickstart/finetuning/LLM_finetuning_overview.md

@@ -1,6 +1,6 @@
 ## LLM Fine-Tuning
 
-Here we discuss fine-tuning Meta Llama 3 with a couple of different recipes. We will cover two scenarios here:
+Here we discuss fine-tuning Meta Llama with a couple of different recipes. We will cover two scenarios here:
 
 
 ## 1. **Parameter Efficient Model Fine-Tuning**
@@ -18,8 +18,6 @@ These methods will address three aspects:
 
 HF [PEFT](https://github.com/huggingface/peft) library provides an easy way of using these methods which we make use of here. Please read more [here](https://huggingface.co/blog/peft).
 
-
-
 ## 2. **Full/ Partial Parameter Fine-Tuning**
 
 Full parameter fine-tuning has its own advantages, in this method there are multiple strategies that can help:
@@ -35,9 +33,9 @@ Full parameter fine-tuning has its own advantages, in this method there are mult
 You can also keep most of the layers frozen and only fine-tune a few layers. There are many different techniques to choose from to freeze/unfreeze layers based on different criteria.
 
 <div style="display: flex;">
-    <img src="../../docs/img/feature_based_fn.png" alt="Image 1" width="250" />
-    <img src="../../docs/img/feature_based_fn_2.png" alt="Image 2" width="250" />
-    <img src="../../docs/img/full_param_fn.png" alt="Image 3" width="250" />
+    <img src="../../../docs/img/feature_based_fn.png" alt="Image 1" width="250" />
+    <img src="../../../docs/img/feature_based_fn_2.png" alt="Image 2" width="250" />
+    <img src="../../../docs/img/full_param_fn.png" alt="Image 3" width="250" />
 </div>
 
 

+ 3 - 3
recipes/quickstart/finetuning/README.md

@@ -54,7 +54,7 @@ It lets us specify the training settings for everything from `model_name` to `da
     output_dir: str = "PATH/to/save/PEFT/model"
     freeze_layers: bool = False
     num_freeze_layers: int = 1
-    quantization: bool = False
+    quantization: str = None
     one_gpu: bool = False
     save_model: bool = True
     dist_checkpoint_root_folder: str="PATH/to/save/FSDP/model" # will be used if using FSDP
@@ -101,11 +101,11 @@ It lets us specify the training settings for everything from `model_name` to `da
 You can enable [W&B](https://wandb.ai/) experiment tracking by using `use_wandb` flag as below. You can change the project name, entity and other `wandb.init` arguments in `wandb_config`.
 
 ```bash
-python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model --use_wandb
+python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization 8bit --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model --use_wandb
 ```
 You'll be able to access a dedicated project or run link on [wandb.ai](https://wandb.ai) and see your dashboard like the one below.
 <div style="display: flex;">
-    <img src="../../../docs/images/wandb_screenshot.png" alt="wandb screenshot" width="500" />
+    <img src="../../../docs/img/wandb_screenshot.png" alt="wandb screenshot" width="500" />
 </div>
 
 ## FLOPS Counting and Pytorch Profiling

+ 3 - 3
recipes/quickstart/finetuning/datasets/README.md

@@ -32,18 +32,18 @@ To supply a custom dataset you need to provide a single .py file which contains
 ```@python
 def get_custom_dataset(dataset_config, tokenizer, split: str):
 ```
-For an example `get_custom_dataset` you can look at the provided datasets in llama_recipes.datasets or [examples/custom_dataset.py](custom_dataset.py).
+For an example `get_custom_dataset` you can look at the provided datasets in llama_recipes.datasets or [custom_dataset.py](./custom_dataset.py).
 The `dataset_config` in the above signature will be an instance of llama_recipes.configs.dataset.custom_dataset with the modifications made through the command line.
 The split signals wether to return the training or validation dataset.
 The default function name is `get_custom_dataset` but this can be changed as described below.
 
 In order to start a training with the custom dataset we need to set the `--dataset` as well as the `--custom_dataset.file` parameter.
 ```
-python -m llama_recipes.finetuning --dataset "custom_dataset" --custom_dataset.file "examples/custom_dataset.py" [TRAINING PARAMETERS]
+python -m llama_recipes.finetuning --dataset "custom_dataset" --custom_dataset.file "custom_dataset.py" [TRAINING PARAMETERS]
 ```
 To change the function name that is used in the .py you can append the name following a `:` like this:
 ```
-python -m llama_recipes.finetuning --dataset "custom_dataset" --custom_dataset.file "examples/custom_dataset.py:get_foo" [TRAINING PARAMETERS]
+python -m llama_recipes.finetuning --dataset "custom_dataset" --custom_dataset.file "custom_dataset.py:get_foo" [TRAINING PARAMETERS]
 ```
 This will call the function `get_foo` instead of `get_custom_dataset` when retrieving the dataset.
 

+ 38 - 5
recipes/quickstart/finetuning/multigpu_finetuning.md

@@ -18,6 +18,14 @@ We will also need 2 packages:
 ## How to run it
 Get access to a machine with multiple GPUs (in this case we tested with 4 A100 and A10s).
 
+### With FSDP + QLORA
+
+This has been tested on 4 H100s GPUs.
+
+```bash
+ FSDP_CPU_RAM_EFFICIENT_LOADING=1 ACCELERATE_USE_FSDP=1 torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --quantization int4 --model_name /path_of_model_folder/70B  --mixed_precision False --low_cpu_fsdp --use_peft --peft_method lora --output_dir Path/to/save/PEFT/model
+```
+
 ### With FSDP + PEFT
 
 <details open>
@@ -49,7 +57,7 @@ The args used in the command above are:
 If interested in running full parameter finetuning without making use of PEFT methods, please use the following command. Make sure to change the `nproc_per_node` to your available GPUs. This has been tested with `BF16` on 8xA100, 40GB GPUs.
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --use_fast_kernels
+torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --use_fast_kernels
 ```
 
 ### Using less CPU memory (FSDP on 70B model)
@@ -57,10 +65,35 @@ torchrun --nnodes 1 --nproc_per_node 8  finetuning.py --enable_fsdp --model_name
 If you are running full parameter fine-tuning on the 70B model, you can enable `low_cpu_fsdp` mode as the following command. This option will load model on rank0 only before moving model to devices to construct FSDP. This can dramatically save cpu memory when loading large models like 70B (on a 8-gpu node, this reduces cpu memory from 2+T to 280G for 70B model). This has been tested with `BF16` on 16xA100, 80GB GPUs.
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 8 finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
+torchrun --nnodes 1 --nproc_per_node 8 finetuning.py --enable_fsdp --low_cpu_fsdp --fsdp_config.pure_bf16 --model_name /path_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
+```
+
+**Multi GPU multi node**:
+
+Here we use a slurm script to schedule a job with slurm over multiple nodes.
+
+```bash
+
+sbatch recipes/quickstart/finetuning/multi_node.slurm
+# Change the num nodes and GPU per nodes in the script before running.
+
+```
+
+To fine-tune the Meta Llama 405B model with LoRA on 32xH100, 80 GB GPUs we need to combine 4bit quantization (QLoRA) and FSDP.
+We can achieve this by adding the following environment variables to the slurm script (before the srun command in the bottom).
+
+```bash
+export FSDP_CPU_RAM_EFFICIENT_LOADING=1
+export ACCELERATE_USE_FSDP=1 
 ```
 
+Then we need to replace the bottom srun command with the following:
+
+```bash
+srun  torchrun --nproc_per_node 8 --rdzv_id $RANDOM --rdzv_backend c10d --rdzv_endpoint $head_node_ip:29500 ./finetuning.py  --enable_fsdp --use_peft --peft_method lora --quantization 4bit  --quantization_config.quant_type nf4 --mixed_precision False --low_cpu_fsdp
+```
 
+Do not forget to adjust the number of nodes, ntasks and gpus-per-task in the top.
 
 ## Running with different datasets
 Currently 3 open source datasets are supported that can be found in [Datasets config file](../../../src/llama_recipes/configs/datasets.py). You can also use your custom dataset (more info [here](./datasets/README.md)).
@@ -79,16 +112,16 @@ To run with each of the datasets set the `dataset` flag in the command as shown
 
 ```bash
 # grammer_dataset
-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset grammar_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned  --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp  --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset alpaca_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16 --output_dir Path/to/save/PEFT/model
+torchrun --nnodes 1 --nproc_per_node 4  finetuning.py --enable_fsdp --model_name /path_of_model_folder/8B --use_peft --peft_method lora --dataset samsum_dataset --save_model --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16 --output_dir Path/to/save/PEFT/model
 
 ```
 

+ 1 - 1
recipes/quickstart/finetuning/quickstart_peft_finetuning.ipynb

@@ -90,7 +90,7 @@
     "from llama_recipes.configs import train_config as TRAIN_CONFIG\n",
     "\n",
     "train_config = TRAIN_CONFIG()\n",
-    "train_config.model_name = \"meta-llama/Meta-Llama-3-8B\"\n",
+    "train_config.model_name = \"meta-llama/Meta-Llama-3.1-8B\"\n",
     "train_config.num_epochs = 1\n",
     "train_config.run_validation = False\n",
     "train_config.gradient_accumulation_steps = 4\n",

+ 8 - 5
recipes/quickstart/finetuning/singlegpu_finetuning.md

@@ -15,14 +15,17 @@ To run fine-tuning on a single GPU, we will make use of two packages:
 
 ## How to run it?
 
+**NOTE** To run the fine-tuning with `QLORA`, make sure to set `--peft_method lora` and `--quantization 4bit --quantization_config.quant_type nf4`.
+
+
 ```bash
-python finetuning.py  --use_peft --peft_method lora --quantization --use_fp16 --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+FSDP_CPU_RAM_EFFICIENT_LOADING=1 python finetuning.py  --use_peft --peft_method lora --quantization 8bit --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 ```
 The args used in the command above are:
 
 * `--use_peft` boolean flag to enable PEFT methods in the script
 * `--peft_method` to specify the PEFT method, here we use `lora` other options are `llama_adapter`, `prefix`.
-* `--quantization` boolean flag to enable int8 quantization
+* `--quantization` string flag to enable 8bit or 4bit quantization
 
 > [!NOTE]
 > In case you are using a multi-GPU machine please make sure to only make one of them visible using `export CUDA_VISIBLE_DEVICES=GPU:id`.
@@ -48,16 +51,16 @@ to run with each of the datasets set the `dataset` flag in the command as shown
 ```bash
 # grammar_dataset
 
-python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset grammar_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m finetuning.py  --use_peft --peft_method lora --quantization 8bit --dataset grammar_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 # alpaca_dataset
 
-python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset alpaca_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m finetuning.py  --use_peft --peft_method lora --quantization 8bit  --dataset alpaca_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 
 # samsum_dataset
 
-python -m finetuning.py  --use_peft --peft_method lora --quantization  --dataset samsum_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
+python -m finetuning.py  --use_peft --peft_method lora --quantization 8bit  --dataset samsum_dataset --model_name /path_of_model_folder/8B --output_dir Path/to/save/PEFT/model
 
 ```
 

+ 4 - 3
recipes/quickstart/inference/README.md

@@ -2,6 +2,7 @@
 
 This folder contains scripts to get you started with inference on Meta Llama models.
 
-* [](./code_llama/) contains scripts for tasks relating to code generation using CodeLlama
-* [](./local_inference/) contsin scripts to do memory efficient inference on servers and local machines
-* [](./mobile_inference/) has scripts using MLC to serve Llama on Android (h/t to OctoAI for the contribution!)
+* [Code Llama](./code_llama/) contains scripts for tasks relating to code generation using CodeLlama
+* [Local Inference](./local_inference/) contains scripts to do memory efficient inference on servers and local machines
+* [Mobile Inference](./mobile_inference/) has scripts using MLC to serve Llama on Android (h/t to OctoAI for the contribution!)
+* [Model Update Example](./modelUpgradeExample.py) shows an example of replacing a Llama 3 model with a Llama 3.1 model. 

+ 10 - 7
recipes/quickstart/inference/local_inference/README.md

@@ -27,8 +27,8 @@ samsum_prompt.txt
 ...
 ```
 
-**Note**
-Currently pad token by default in [HuggingFace Tokenizer is `None`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/tokenization_llama.py#L110). We add the padding token as a special token to the tokenizer, which in this case requires to resize the token_embeddings as shown below:
+**Note on Llama version < 3.1**
+The default padding token in [HuggingFace Tokenizer is `None`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/tokenization_llama.py#L110). To use padding the padding token needs to be added as a special token to the tokenizer, which in this case requires to resize the token_embeddings as shown below:
 
 ```python
 tokenizer.add_special_tokens(
@@ -39,14 +39,13 @@ tokenizer.add_special_tokens(
     )
 model.resize_token_embeddings(model.config.vocab_size + 1)
 ```
-Padding would be required for batch inference. In this this [example](inference.py), batch size = 1 so essentially padding is not required. However,We added the code pointer as an example in case of batch inference.
-
+Padding would be required for batched inference. In this [example](inference.py), batch size = 1 so essentially padding is not required. However, we added the code pointer as an example in case of batch inference. For Llama version 3.1 use the special token `<|finetune_right_pad_id|> (128004)` for padding.
 
 ## Chat completion
 The inference folder also includes a chat completion example, that adds built-in safety features in fine-tuned models to the prompt tokens. To run the example:
 
 ```bash
-python chat_completion/chat_completion.py --model_name "PATH/TO/MODEL/7B/" --prompt_file chat_completion/chats.json  --quantization --use_auditnlg
+python chat_completion/chat_completion.py --model_name "PATH/TO/MODEL/7B/" --prompt_file chat_completion/chats.json  --quantization 8bit --use_auditnlg
 
 ```
 
@@ -55,7 +54,7 @@ python chat_completion/chat_completion.py --model_name "PATH/TO/MODEL/7B/" --pro
 Setting `use_fast_kernels` will enable using of Flash Attention or Xformer memory-efficient kernels based on the hardware being used. This would speed up inference when used for batched inputs. This has been enabled in `optimum` library from HuggingFace as a one-liner API, please read more [here](https://pytorch.org/blog/out-of-the-box-acceleration/).
 
 ```bash
-python chat_completion/chat_completion.py --model_name "PATH/TO/MODEL/7B/" --prompt_file chat_completion/chats.json  --quantization --use_auditnlg --use_fast_kernels
+python chat_completion/chat_completion.py --model_name "PATH/TO/MODEL/7B/" --prompt_file chat_completion/chats.json  --quantization 8bit --use_auditnlg --use_fast_kernels
 
 python inference.py --model_name <training_config.output_dir> --peft_model <training_config.output_dir> --prompt_file <test_prompt_file> --use_auditnlg --use_fast_kernels
 
@@ -69,7 +68,7 @@ In case you have fine-tuned your model with pure FSDP and saved the checkpoints
 This is helpful if you have fine-tuned you model using FSDP only as follows:
 
 ```bash
-torchrun --nnodes 1 --nproc_per_node 8  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --pure_bf16
+torchrun --nnodes 1 --nproc_per_node 8  recipes/quickstart/finetuning/finetuning.py --enable_fsdp --model_name /path_of_model_folder/7B --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned --fsdp_config.pure_bf16
 ```
 Then convert your FSDP checkpoint to HuggingFace checkpoints using:
 ```bash
@@ -85,3 +84,7 @@ Then run inference using:
 python inference.py --model_name <training_config.output_dir> --prompt_file <test_prompt_file>
 
 ```
+
+## Inference on large models like Meta Llama 405B
+The FP8 quantized variants of Meta Llama (i.e. meta-llama/Meta-Llama-3.1-405B-FP8 and meta-llama/Meta-Llama-3.1-405B-Instruct-FP8) can be executed on a single node with 8x80GB H100 using the scripts located in this folder.
+To run the unquantized Meta Llama 405B variants (i.e. meta-llama/Meta-Llama-3.1-405B and meta-llama/Meta-Llama-3.1-405B-Instruct) we need to use a multi-node setup for inference. The llama-recipes inference script currently does not allow multi-node inference. To run this model you can use vLLM with pipeline and tensor parallelism as showed in [this example](../../../3p_integrations/vllm/README.md).

+ 13 - 11
recipes/quickstart/inference/local_inference/chat_completion/chat_completion.py

@@ -4,6 +4,7 @@
 # from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 
 import fire
+import json
 import os
 import sys
 
@@ -18,7 +19,7 @@ from accelerate.utils import is_xpu_available
 def main(
     model_name,
     peft_model: str=None,
-    quantization: bool=False,
+    quantization: str = None, # Options: 4bit, 8bit
     max_new_tokens =256, #The maximum numbers of tokens to generate
     min_new_tokens:int=0, #The minimum numbers of tokens to generate
     prompt_file: str=None,
@@ -47,33 +48,32 @@ def main(
 
     elif not sys.stdin.isatty():
         dialogs = "\n".join(sys.stdin.readlines())
+        try:
+            dialogs = json.loads(dialogs)
+        except:
+            print("Could not parse json from stdin. Please provide a json file with the user prompts. Exiting.")
+            sys.exit(1)
     else:
         print("No user prompt provided. Exiting.")
         sys.exit(1)
 
     print(f"User dialogs:\n{dialogs}")
     print("\n==================================\n")
-
-
+    
     # Set the seeds for reproducibility
     if is_xpu_available():
         torch.xpu.manual_seed(seed)
     else:
         torch.cuda.manual_seed(seed)
     torch.manual_seed(seed)
-    model = load_model(model_name, quantization, use_fast_kernels)
+
+    model = load_model(model_name, quantization, use_fast_kernels, **kwargs)
     if peft_model:
         model = load_peft_model(model, peft_model)
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    tokenizer.add_special_tokens(
-        {
-
-            "pad_token": "<PAD>",
-        }
-    )
 
-    chats = tokenizer.apply_chat_template(dialogs)
+    chats = [tokenizer.apply_chat_template(dialog) for dialog in dialogs]
 
     with torch.no_grad():
         for idx, chat in enumerate(chats):
@@ -99,12 +99,14 @@ def main(
                 sys.exit(1)  # Exit the program with an error status
             tokens= torch.tensor(chat).long()
             tokens= tokens.unsqueeze(0)
+            attention_mask = torch.ones_like(tokens)
             if is_xpu_available():
                 tokens= tokens.to("xpu:0")
             else:
                 tokens= tokens.to("cuda:0")
             outputs = model.generate(
                 input_ids=tokens,
+                attention_mask=attention_mask,
                 max_new_tokens=max_new_tokens,
                 do_sample=do_sample,
                 top_p=top_p,

+ 144 - 128
recipes/quickstart/inference/local_inference/inference.py

@@ -1,68 +1,46 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
-# from accelerate import init_empty_weights, load_checkpoint_and_dispatch
-
-import fire
 import os
 import sys
 import time
+
+import fire
 import gradio as gr
 
 import torch
-from transformers import AutoTokenizer
 
-from llama_recipes.inference.safety_utils import get_safety_checker, AgentType
+from accelerate.utils import is_xpu_available
 from llama_recipes.inference.model_utils import load_model, load_peft_model
 
-from accelerate.utils import is_xpu_available
+from llama_recipes.inference.safety_utils import AgentType, get_safety_checker
+from transformers import AutoTokenizer
+
 
 def main(
     model_name,
-    peft_model: str=None,
-    quantization: bool=False,
-    max_new_tokens =100, #The maximum numbers of tokens to generate
-    prompt_file: str=None,
-    seed: int=42, #seed value for reproducibility
-    do_sample: bool=True, #Whether or not to use sampling ; use greedy decoding otherwise.
-    min_length: int=None, #The minimum length of the sequence to be generated, input prompt + min_new_tokens
-    use_cache: bool=True,  #[optional] Whether or not the model should use the past last key/values attentions Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.
-    top_p: float=1.0, # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
-    temperature: float=1.0, # [optional] The value used to modulate the next token probabilities.
-    top_k: int=50, # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering.
-    repetition_penalty: float=1.0, #The parameter for repetition penalty. 1.0 means no penalty.
-    length_penalty: int=1, #[optional] Exponential penalty to the length that is used with beam-based generation.
-    enable_azure_content_safety: bool=False, # Enable safety check with Azure content safety api
-    enable_sensitive_topics: bool=False, # Enable check for sensitive topics using AuditNLG APIs
-    enable_salesforce_content_safety: bool=True, # Enable safety check with Salesforce safety flan t5
-    enable_llamaguard_content_safety: bool=False,
-    max_padding_length: int=None, # the max padding length to be used with tokenizer padding the prompts.
-    use_fast_kernels: bool = False, # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels
-    **kwargs
+    peft_model: str = None,
+    quantization: str = None, # Options: 4bit, 8bit
+    max_new_tokens=100,  # The maximum numbers of tokens to generate
+    prompt_file: str = None,
+    seed: int = 42,  # seed value for reproducibility
+    do_sample: bool = True,  # Whether or not to use sampling ; use greedy decoding otherwise.
+    min_length: int = None,  # The minimum length of the sequence to be generated, input prompt + min_new_tokens
+    use_cache: bool = True,  # [optional] Whether or not the model should use the past last key/values attentions Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.
+    top_p: float = 1.0,  # [optional] If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+    temperature: float = 1.0,  # [optional] The value used to modulate the next token probabilities.
+    top_k: int = 50,  # [optional] The number of highest probability vocabulary tokens to keep for top-k-filtering.
+    repetition_penalty: float = 1.0,  # The parameter for repetition penalty. 1.0 means no penalty.
+    length_penalty: int = 1,  # [optional] Exponential penalty to the length that is used with beam-based generation.
+    enable_azure_content_safety: bool = False,  # Enable safety check with Azure content safety api
+    enable_sensitive_topics: bool = False,  # Enable check for sensitive topics using AuditNLG APIs
+    enable_salesforce_content_safety: bool = True,  # Enable safety check with Salesforce safety flan t5
+    enable_llamaguard_content_safety: bool = False,
+    max_padding_length: int = None,  # the max padding length to be used with tokenizer padding the prompts.
+    use_fast_kernels: bool = False,  # Enable using SDPA from PyTroch Accelerated Transformers, make use Flash Attention and Xformer memory-efficient kernels
+    share_gradio: bool = False,  # Enable endpoint creation for gradio.live
+    **kwargs,
 ):
-
-  def inference(user_prompt, temperature, top_p, top_k, max_new_tokens, **kwargs,):
-    safety_checker = get_safety_checker(enable_azure_content_safety,
-                                        enable_sensitive_topics,
-                                        enable_salesforce_content_safety,
-                                        enable_llamaguard_content_safety
-                                        )
-
-    # Safety check of the user prompt
-    safety_results = [check(user_prompt) for check in safety_checker]
-    are_safe = all([r[1] for r in safety_results])
-    if are_safe:
-        print("User prompt deemed safe.")
-        print(f"User prompt:\n{user_prompt}")
-    else:
-        print("User prompt deemed unsafe.")
-        for method, is_safe, report in safety_results:
-            if not is_safe:
-                print(method)
-                print(report)
-        print("Skipping the inference as the prompt is not safe.")
-        sys.exit(1)  # Exit the program with an error status
-
     # Set the seeds for reproducibility
     if is_xpu_available():
         torch.xpu.manual_seed(seed)
@@ -70,7 +48,7 @@ def main(
         torch.cuda.manual_seed(seed)
     torch.manual_seed(seed)
 
-    model = load_model(model_name, quantization, use_fast_kernels)
+    model = load_model(model_name, quantization, use_fast_kernels, **kwargs)
     if peft_model:
         model = load_peft_model(model, peft_model)
 
@@ -79,86 +57,124 @@ def main(
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     tokenizer.pad_token = tokenizer.eos_token
 
-    batch = tokenizer(user_prompt, padding='max_length', truncation=True, max_length=max_padding_length, return_tensors="pt")
-    if is_xpu_available():
-        batch = {k: v.to("xpu") for k, v in batch.items()}
-    else:
-        batch = {k: v.to("cuda") for k, v in batch.items()}
-
-    start = time.perf_counter()
-    with torch.no_grad():
-        outputs = model.generate(
-            **batch,
-            max_new_tokens=max_new_tokens,
-            do_sample=do_sample,
-            top_p=top_p,
-            temperature=temperature,
-            min_length=min_length,
-            use_cache=use_cache,
-            top_k=top_k,
-            repetition_penalty=repetition_penalty,
-            length_penalty=length_penalty,
-            **kwargs
+    def inference(
+        user_prompt,
+        temperature,
+        top_p,
+        top_k,
+        max_new_tokens,
+        **kwargs,
+    ):
+        safety_checker = get_safety_checker(
+            enable_azure_content_safety,
+            enable_sensitive_topics,
+            enable_salesforce_content_safety,
+            enable_llamaguard_content_safety,
         )
-    e2e_inference_time = (time.perf_counter()-start)*1000
-    print(f"the inference time is {e2e_inference_time} ms")
-    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    # Safety check of the model output
-    safety_results = [check(output_text, agent_type=AgentType.AGENT, user_prompt=user_prompt) for check in safety_checker]
-    are_safe = all([r[1] for r in safety_results])
-    if are_safe:
-        print("User input and model output deemed safe.")
-        print(f"Model output:\n{output_text}")
-    else:
-        print("Model output deemed unsafe.")
-        for method, is_safe, report in safety_results:
-            if not is_safe:
-                print(method)
-                print(report)
-    return output_text
-
-  if prompt_file is not None:
-      assert os.path.exists(
-          prompt_file
-      ), f"Provided Prompt file does not exist {prompt_file}"
-      with open(prompt_file, "r") as f:
-          user_prompt = "\n".join(f.readlines())
-      inference(user_prompt, temperature, top_p, top_k, max_new_tokens)
-  elif not sys.stdin.isatty():
-      user_prompt = "\n".join(sys.stdin.readlines())
-      inference(user_prompt, temperature, top_p, top_k, max_new_tokens)
-  else:
-      gr.Interface(
-        fn=inference,
-        inputs=[
-            gr.components.Textbox(
-                lines=9,
-                label="User Prompt",
-                placeholder="none",
-            ),
-            gr.components.Slider(
-                minimum=0, maximum=1, value=1.0, label="Temperature"
-            ),
-            gr.components.Slider(
-                minimum=0, maximum=1, value=1.0, label="Top p"
-            ),
-            gr.components.Slider(
-                minimum=0, maximum=100, step=1, value=50, label="Top k"
-            ),
-            gr.components.Slider(
-                minimum=1, maximum=2000, step=1, value=200, label="Max tokens"
-            ),
-        ],
-        outputs=[
-            gr.components.Textbox(
-                lines=5,
-                label="Output",
+
+        # Safety check of the user prompt
+        safety_results = [check(user_prompt) for check in safety_checker]
+        are_safe = all([r[1] for r in safety_results])
+        if are_safe:
+            print("User prompt deemed safe.")
+            print(f"User prompt:\n{user_prompt}")
+        else:
+            print("User prompt deemed unsafe.")
+            for method, is_safe, report in safety_results:
+                if not is_safe:
+                    print(method)
+                    print(report)
+            print("Skipping the inference as the prompt is not safe.")
+            return  # Exit the program with an error status
+
+        batch = tokenizer(
+            user_prompt,
+            truncation=True,
+            max_length=max_padding_length,
+            return_tensors="pt",
+        )
+        if is_xpu_available():
+            batch = {k: v.to("xpu") for k, v in batch.items()}
+        else:
+            batch = {k: v.to("cuda") for k, v in batch.items()}
+
+        start = time.perf_counter()
+        with torch.no_grad():
+            outputs = model.generate(
+                **batch,
+                max_new_tokens=max_new_tokens,
+                do_sample=do_sample,
+                top_p=top_p,
+                temperature=temperature,
+                min_length=min_length,
+                use_cache=use_cache,
+                top_k=top_k,
+                repetition_penalty=repetition_penalty,
+                length_penalty=length_penalty,
+                **kwargs,
             )
-        ],
-        title="Meta Llama3 Playground",
-        description="https://github.com/facebookresearch/llama-recipes",
-      ).queue().launch(server_name="0.0.0.0", share=True)
+        e2e_inference_time = (time.perf_counter() - start) * 1000
+        print(f"the inference time is {e2e_inference_time} ms")
+        output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        # Safety check of the model output
+        safety_results = [
+            check(output_text, agent_type=AgentType.AGENT, user_prompt=user_prompt)
+            for check in safety_checker
+        ]
+        are_safe = all([r[1] for r in safety_results])
+        if are_safe:
+            print("User input and model output deemed safe.")
+            print(f"Model output:\n{output_text}")
+            return output_text
+        else:
+            print("Model output deemed unsafe.")
+            for method, is_safe, report in safety_results:
+                if not is_safe:
+                    print(method)
+                    print(report)
+            return None
+
+    if prompt_file is not None:
+        assert os.path.exists(
+            prompt_file
+        ), f"Provided Prompt file does not exist {prompt_file}"
+        with open(prompt_file, "r") as f:
+            user_prompt = "\n".join(f.readlines())
+        inference(user_prompt, temperature, top_p, top_k, max_new_tokens)
+    elif not sys.stdin.isatty():
+        user_prompt = "\n".join(sys.stdin.readlines())
+        inference(user_prompt, temperature, top_p, top_k, max_new_tokens)
+    else:
+        gr.Interface(
+            fn=inference,
+            inputs=[
+                gr.components.Textbox(
+                    lines=9,
+                    label="User Prompt",
+                    placeholder="none",
+                ),
+                gr.components.Slider(
+                    minimum=0, maximum=1, value=1.0, label="Temperature"
+                ),
+                gr.components.Slider(minimum=0, maximum=1, value=1.0, label="Top p"),
+                gr.components.Slider(
+                    minimum=0, maximum=100, step=1, value=50, label="Top k"
+                ),
+                gr.components.Slider(
+                    minimum=1, maximum=2000, step=1, value=200, label="Max tokens"
+                ),
+            ],
+            outputs=[
+                gr.components.Textbox(
+                    lines=5,
+                    label="Output",
+                )
+            ],
+            title="Meta Llama3 Playground",
+            description="https://github.com/meta-llama/llama-recipes",
+        ).queue().launch(server_name="0.0.0.0", share=share_gradio)
+
 
 if __name__ == "__main__":
     fire.Fire(main)

+ 51 - 0
recipes/quickstart/inference/modelUpgradeExample.py

@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+# Running the script without any arguments "python modelUpgradeExample.py" performs inference with the Llama 3 8B Instruct model. 
+# Passing  --model-id "meta-llama/Meta-Llama-3.1-8B-Instruct" to the script will switch it to using the Llama 3.1 version of the same model. 
+# The script also shows the input tokens to confirm that the models are responding to the same input
+
+import fire
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+
+def main(model_id = "meta-llama/Meta-Llama-3-8B-Instruct"):
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+    )
+
+    messages = [
+        {"role": "system", "content": "You are a helpful chatbot"},
+        {"role": "user", "content": "Why is the sky blue?"},
+        {"role": "assistant", "content": "Because the light is scattered"},
+        {"role": "user", "content": "Please tell me more about that"},
+    ]
+
+    input_ids = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        return_tensors="pt",
+    ).to(model.device)
+
+    print("Input tokens:")
+    print(input_ids)
+    
+    attention_mask = torch.ones_like(input_ids)
+    outputs = model.generate(
+        input_ids,
+        max_new_tokens=400,
+        eos_token_id=tokenizer.eos_token_id,
+        do_sample=True,
+        temperature=0.6,
+        top_p=0.9,
+        attention_mask=attention_mask,
+    )
+    response = outputs[0][input_ids.shape[-1]:]
+    print("\nOutput:\n")
+    print(tokenizer.decode(response, skip_special_tokens=True))
+
+if __name__ == "__main__":
+  fire.Fire(main)

+ 10 - 7
recipes/responsible_ai/README.md

@@ -1,11 +1,14 @@
-# Meta Llama Guard
+# Trust and Safety with Llama
 
-Meta Llama Guard and Meta Llama Guard 2 are new models that provide input and output guardrails for LLM inference. For more details, please visit the main [repository](https://github.com/facebookresearch/PurpleLlama/tree/main/Llama-Guard2).
+The [Purple Llama](https://github.com/meta-llama/PurpleLlama/) project provides tools and models to improve LLM security. This folder contains examples to get started with PurpleLlama tools.
 
-**Note** Please find the right model on HF side [here](https://huggingface.co/meta-llama/Meta-Llama-Guard-2-8B).
+| Tool/Model | Description | Get Started
+|---|---|---|
+[Llama Guard](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama-guard-3) | Provide guardrailing on inputs and outputs | [Inference](./llama_guard/inference.py), [Finetuning](./llama_guard/llama_guard_customization_via_prompting_and_fine_tuning.ipynb)
+[Prompt Guard](https://llama.meta.com/docs/model-cards-and-prompt-formats/prompt-guard) | Model to safeguards against jailbreak attempts and embedded prompt injections | [Notebook](./prompt_guard/prompt_guard_tutorial.ipynb)
+[Code Shield](https://github.com/meta-llama/PurpleLlama/tree/main/CodeShield) | Tool to safeguard against insecure code generated by the LLM | [Notebook](https://github.com/meta-llama/PurpleLlama/blob/main/CodeShield/notebook/CodeShieldUsageDemo.ipynb)
 
-### Running locally
-The [llama_guard](llama_guard) folder contains the inference script to run Meta Llama Guard locally. Add test prompts directly to the [inference script](llama_guard/inference.py) before running it.
 
-### Running on the cloud
-The notebooks [Purple_Llama_Anyscale](Purple_Llama_Anyscale.ipynb) & [Purple_Llama_OctoAI](Purple_Llama_OctoAI.ipynb) contain examples for running Meta Llama Guard on cloud hosted endpoints.
+
+### Running on hosted APIs
+The notebooks [input_output_guardrails.ipynb](./input_output_guardrails_with_llama.ipynb),  [Purple_Llama_Anyscale](Purple_Llama_Anyscale.ipynb) & [Purple_Llama_OctoAI](Purple_Llama_OctoAI.ipynb) contain examples for running Meta Llama Guard on cloud hosted endpoints.

+ 8 - 8
recipes/responsible_ai/llama_guard/README.md

@@ -1,8 +1,8 @@
 # Meta Llama Guard demo
 <!-- markdown-link-check-disable -->
-Meta Llama Guard is a language model that provides input and output guardrails for LLM inference. For more details and model cards, please visit the main repository for each model, [Meta Llama Guard](https://github.com/meta-llama/PurpleLlama/tree/main/Llama-Guard) and Meta [Llama Guard 2](https://github.com/meta-llama/PurpleLlama/tree/main/Llama-Guard2).
+Meta Llama Guard is a language model that provides input and output guardrails for LLM inference. For more details and model cards, please visit the [PurpleLlama](https://github.com/meta-llama/PurpleLlama) repository.
 
-This folder contains an example file to run inference with a locally hosted model, either using the Hugging Face Hub or a local path. 
+This folder contains an example file to run inference with a locally hosted model, either using the Hugging Face Hub or a local path.
 
 ## Requirements
 1. Access to Llama guard model weights on Hugging Face. To get access, follow the steps described [here](https://github.com/facebookresearch/PurpleLlama/tree/main/Llama-Guard#download)
@@ -10,7 +10,7 @@ This folder contains an example file to run inference with a locally hosted mode
 
 
 ## Llama Guard inference script
-For testing, you can add User or User/Agent interactions into the prompts list and the run the script to verify the results. When the conversation has one or more Agent responses, it's considered of type agent. 
+For testing, you can add User or User/Agent interactions into the prompts list and the run the script to verify the results. When the conversation has one or more Agent responses, it's considered of type agent.
 
 
 ```
@@ -55,9 +55,9 @@ This is the output:
 
 To run it with a local model, you can use the `model_id` param in the inference script:
 
-`python recipes/responsible_ai/llama_guard/inference.py --model_id=/home/ubuntu/models/llama3/llama_guard_2-hf/ --llama_guard_version=LLAMA_GUARD_2`
+`python recipes/responsible_ai/llama_guard/inference.py --model_id=/home/ubuntu/models/llama3/Llama-Guard-3-8B/ --llama_guard_version=LLAMA_GUARD_3`
 
-Note: Make sure to also add the llama_guard_version if when it does not match the default, the script allows you to run the prompt format from Meta Llama Guard 1 on Meta Llama Guard 2
+Note: Make sure to also add the llama_guard_version; by default it uses LLAMA_GUARD_3
 
 ## Inference Safety Checker
 When running the regular inference script with prompts, Meta Llama Guard will be used as a safety checker on the user prompt and the model output. If both are safe, the result will be shown, else a message with the error will be shown, with the word unsafe and a comma separated list of categories infringed. Meta Llama Guard is always loaded quantized using Hugging Face Transformers library with bitsandbytes.
@@ -66,7 +66,7 @@ In this case, the default categories are applied by the tokenizer, using the `ap
 
 Use this command for testing with a quantized Llama model, modifying the values accordingly:
 
-`python examples/inference.py --model_name <path_to_regular_llama_model> --prompt_file <path_to_prompt_file> --quantization --enable_llamaguard_content_safety`
-
-
+`python examples/inference.py --model_name <path_to_regular_llama_model> --prompt_file <path_to_prompt_file> --quantization 8bit --enable_llamaguard_content_safety`
 
+## Llama Guard 3 Finetuning & Customization
+The safety categories in Llama Guard 3 can be tuned for specific application needs. Existing categories can be removed and new categories can be added to the taxonomy. The [Llama Guard Customization](./llama_guard_customization_via_prompting_and_fine_tuning.ipynb) notebook walks through the process.

+ 2 - 2
recipes/responsible_ai/llama_guard/inference.py

@@ -14,8 +14,8 @@ class AgentType(Enum):
     USER = "User"
 
 def main(
-    model_id: str = "meta-llama/LlamaGuard-7b",
-    llama_guard_version: LlamaGuardVersion = LlamaGuardVersion.LLAMA_GUARD_1
+    model_id: str = "meta-llama/Llama-Guard-3-8B",
+    llama_guard_version: str = "LLAMA_GUARD_3"
 ):
     """
     Entry point for Llama Guard inference sample script.

文件差異過大導致無法顯示
+ 793 - 0
recipes/responsible_ai/llama_guard/llama_guard_customization_via_prompting_and_fine_tuning.ipynb


+ 11 - 0
recipes/responsible_ai/prompt_guard/README.md

@@ -0,0 +1,11 @@
+# Prompt Guard demo
+<!-- markdown-link-check-disable -->
+Prompt Guard is a classifier model that provides input guardrails for LLM inference, particularly against *prompt attacks. For more details and model cards, please visit the main repository, [Meta Prompt Guard](https://github.com/meta-llama/PurpleLlama/tree/main/Prompt-Guard)
+
+This folder contains an example file to run inference with a locally hosted model, either using the Hugging Face Hub or a local path. It also contains a comprehensive demo demonstrating the scenarios in which the model is effective and a script for fine-tuning the model.
+
+This is a very small model and inference and fine-tuning are feasible on local CPUs.
+
+## Requirements
+1. Access to Prompt Guard model weights on Hugging Face. To get access, follow the steps described [here](https://github.com/facebookresearch/PurpleLlama/tree/main/Prompt-Guard#download)
+2. Llama recipes package and it's dependencies [installed](https://github.com/meta-llama/llama-recipes?tab=readme-ov-file#installing)

+ 0 - 0
recipes/responsible_ai/prompt_guard/__init__.py


+ 180 - 0
recipes/responsible_ai/prompt_guard/inference.py

@@ -0,0 +1,180 @@
+import torch
+from torch.nn.functional import softmax
+
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+)
+
+"""
+Utilities for loading the PromptGuard model and evaluating text for jailbreaks and indirect injections.
+
+Note that the underlying model has a maximum recommended input size of 512 tokens as a DeBERTa model.
+The final two functions in this file implement efficient parallel batched evaluation of the model on a list
+of input strings of arbirary length, with the final score for each input being the maximum score across all
+chunks of the input string.
+"""
+
+
+def load_model_and_tokenizer(model_name='meta-llama/Prompt-Guard-86M'):
+    """
+    Load the PromptGuard model from Hugging Face or a local model.
+    
+    Args:
+        model_name (str): The name of the model to load. Default is 'meta-llama/Prompt-Guard-86M'.
+        
+    Returns:
+        transformers.PreTrainedModel: The loaded model.
+    """
+    model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    return model, tokenizer
+
+
+def get_class_probabilities(model, tokenizer, text, temperature=1.0, device='cpu'):
+    """
+    Evaluate the model on the given text with temperature-adjusted softmax.
+    Note, as this is a DeBERTa model, the input text should have a maximum length of 512.
+    
+    Args:
+        text (str): The input text to classify.
+        temperature (float): The temperature for the softmax function. Default is 1.0.
+        device (str): The device to evaluate the model on.
+        
+    Returns:
+        torch.Tensor: The probability of each class adjusted by the temperature.
+    """
+    # Encode the text
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    inputs = inputs.to(device)
+    # Get logits from the model
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    # Apply temperature scaling
+    scaled_logits = logits / temperature
+    # Apply softmax to get probabilities
+    probabilities = softmax(scaled_logits, dim=-1)
+    return probabilities
+
+
+def get_jailbreak_score(model, tokenizer, text, temperature=1.0, device='cpu'):
+    """
+    Evaluate the probability that a given string contains malicious jailbreak or prompt injection.
+    Appropriate for filtering dialogue between a user and an LLM.
+    
+    Args:
+        text (str): The input text to evaluate.
+        temperature (float): The temperature for the softmax function. Default is 1.0.
+        device (str): The device to evaluate the model on.
+        
+    Returns:
+        float: The probability of the text containing malicious content.
+    """
+    probabilities = get_class_probabilities(model, tokenizer, text, temperature, device)
+    return probabilities[0, 2].item()
+
+
+def get_indirect_injection_score(model, tokenizer, text, temperature=1.0, device='cpu'):
+    """
+    Evaluate the probability that a given string contains any embedded instructions (malicious or benign).
+    Appropriate for filtering third party inputs (e.g. web searches, tool outputs) into an LLM.
+    
+    Args:
+        text (str): The input text to evaluate.
+        temperature (float): The temperature for the softmax function. Default is 1.0.
+        device (str): The device to evaluate the model on.
+        
+    Returns:
+        float: The combined probability of the text containing malicious or embedded instructions.
+    """
+    probabilities = get_class_probabilities(model, tokenizer, text, temperature, device)
+    return (probabilities[0, 1] + probabilities[0, 2]).item()
+
+
+def process_text_batch(model, tokenizer, texts, temperature=1.0, device='cpu'):
+    """
+    Process a batch of texts and return their class probabilities.
+    Args:
+        model (transformers.PreTrainedModel): The loaded model.
+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
+        texts (list[str]): A list of texts to process.
+        temperature (float): The temperature for the softmax function.
+        device (str): The device to evaluate the model on.
+        
+    Returns:
+        torch.Tensor: A tensor containing the class probabilities for each text in the batch.
+    """
+    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    inputs = inputs.to(device)
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    scaled_logits = logits / temperature
+    probabilities = softmax(scaled_logits, dim=-1)
+    return probabilities
+
+
+def get_scores_for_texts(model, tokenizer, texts, score_indices, temperature=1.0, device='cpu', max_batch_size=16):
+    """
+    Compute scores for a list of texts, handling texts of arbitrary length by breaking them into chunks and processing in parallel.
+    Args:
+        model (transformers.PreTrainedModel): The loaded model.
+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
+        texts (list[str]): A list of texts to evaluate.
+        score_indices (list[int]): Indices of scores to sum for final score calculation.
+        temperature (float): The temperature for the softmax function.
+        device (str): The device to evaluate the model on.
+        max_batch_size (int): The maximum number of text chunks to process in a single batch.
+        
+    Returns:
+        list[float]: A list of scores for each text.
+    """
+    all_chunks = []
+    text_indices = []
+    for index, text in enumerate(texts):
+        chunks = [text[i:i+512] for i in range(0, len(text), 512)]
+        all_chunks.extend(chunks)
+        text_indices.extend([index] * len(chunks))
+    all_scores = [0] * len(texts)
+    for i in range(0, len(all_chunks), max_batch_size):
+        batch_chunks = all_chunks[i:i+max_batch_size]
+        batch_indices = text_indices[i:i+max_batch_size]
+        probabilities = process_text_batch(model, tokenizer, batch_chunks, temperature, device)
+        scores = probabilities[:, score_indices].sum(dim=1).tolist()
+        
+        for idx, score in zip(batch_indices, scores):
+            all_scores[idx] = max(all_scores[idx], score)
+    return all_scores
+
+
+def get_jailbreak_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16):
+    """
+    Compute jailbreak scores for a list of texts.
+    Args:
+        model (transformers.PreTrainedModel): The loaded model.
+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
+        texts (list[str]): A list of texts to evaluate.
+        temperature (float): The temperature for the softmax function.
+        device (str): The device to evaluate the model on.
+        max_batch_size (int): The maximum number of text chunks to process in a single batch.
+        
+    Returns:
+        list[float]: A list of jailbreak scores for each text.
+    """
+    return get_scores_for_texts(model, tokenizer, texts, [2], temperature, device, max_batch_size)
+
+
+def get_indirect_injection_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16):
+    """
+    Compute indirect injection scores for a list of texts.
+    Args:
+        model (transformers.PreTrainedModel): The loaded model.
+        tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
+        texts (list[str]): A list of texts to evaluate.
+        temperature (float): The temperature for the softmax function.
+        device (str): The device to evaluate the model on.
+        max_batch_size (int): The maximum number of text chunks to process in a single batch.
+        
+    Returns:
+        list[float]: A list of indirect injection scores for each text.
+    """
+    return get_scores_for_texts(model, tokenizer, texts, [1, 2], temperature, device, max_batch_size)

文件差異過大導致無法顯示
+ 817 - 0
recipes/responsible_ai/prompt_guard/prompt_guard_tutorial.ipynb


+ 3 - 6
recipes/use_cases/README.md

@@ -7,17 +7,14 @@ This demo app shows how to use LangChain and Llama 3 to let users ask questions
 ## [live_data](live_data.ipynb): Ask Llama 3 about Live Data (using Replicate or [OctoAI](../3p_integrations/octoai/live_data.ipynb))
 This demo app shows how to perform live data augmented generation tasks with Llama 3, [LlamaIndex](https://github.com/run-llama/llama_index), another leading open-source framework for building LLM apps, and the [Tavily](https://tavily.com) live search API.
 
-## [WhatsApp Chatbot](./customerservice_chatbots/whatsapp_llama/whatsapp_llama3.md): Building a Llama 3 Enabled WhatsApp Chatbot
+## [WhatsApp Chatbot](./customerservice_chatbots/whatsapp_chatbot/whatsapp_llama3.md): Building a Llama 3 Enabled WhatsApp Chatbot
 This step-by-step tutorial shows how to use the [WhatsApp Business API](https://developers.facebook.com/docs/whatsapp/cloud-api/overview) to build a Llama 3 enabled WhatsApp chatbot.
 
-## [Messenger Chatbot](./customerservice_chatbots/messenger_llama/messenger_llama3.md): Building a Llama 3 Enabled Messenger Chatbot
+## [Messenger Chatbot](./customerservice_chatbots/messenger_chatbot/messenger_llama3.md): Building a Llama 3 Enabled Messenger Chatbot
 This step-by-step tutorial shows how to use the [Messenger Platform](https://developers.facebook.com/docs/messenger-platform/overview) to build a Llama 3 enabled Messenger chatbot.
 
 ### RAG Chatbot Example (running [locally](./customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb) or on [OctoAI](../3p_integrations/octoai/RAG_chatbot_example/RAG_chatbot_example.ipynb))
 A complete example of how to build a Llama 3 chatbot hosted on your browser that can answer questions based on your own data using retrieval augmented generation (RAG). You can run Llama2 locally if you have a good enough GPU or on OctoAI if you follow the note [here](../README.md#octoai_note).
 
-## [Sales Bot](./customerservice_chatbots/sales_bot/SalesBot.ipynb): Sales Bot with Llama3 - A Summarization and RAG Use Case
+## [Sales Bot](./customerservice_chatbots/ai_agent_chatbot/SalesBot.ipynb): Sales Bot with Llama3 - A Summarization and RAG Use Case
 An summarization + RAG use case built around the Amazon product review Kaggle dataset to build a helpful Music Store Sales Bot. The summarization and RAG are built on top of Llama models hosted on OctoAI, and the vector database is hosted on Weaviate Cloud Services.
-
-## [Media Generation](./MediaGen.ipynb): Building a Video Generation Pipeline with Llama3
-This step-by-step tutorial shows how to use leverage Llama 3 to drive the generation of animated videos using SDXL and SVD. More specifically it relies on JSON formatting to produce a scene-by-scene story board of a recipe video. The user provides the name of a dish, then Llama 3 describes a step by step guide to reproduce the said dish. This step by step guide is brought to life with models like SDXL and SVD.

+ 1 - 1
recipes/use_cases/customerservice_chatbots/RAG_chatbot/RAG_Chatbot_Example.ipynb

@@ -418,7 +418,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model=meta-llama/Meta-Llama-3-8B-Instruct\n",
+    "model=meta-llama/Meta-Llama-3.1-8B-Instruct\n",
     "volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run\n",
     "token=#your-huggingface-token\n",
     "docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model"

+ 3 - 3
recipes/use_cases/customerservice_chatbots/RAG_chatbot/vectorstore/mongodb/rag_mongodb_llama3_huggingface_open_source.ipynb

@@ -934,11 +934,11 @@
       "source": [
         "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
         "import torch\n",
-        "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+        "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
         "# CPU Enabled uncomment below 👇🏽\n",
-        "# model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Meta-Llama-3-8B-Instruct\")\n",
+        "# model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
         "# GPU Enabled use below 👇🏽\n",
-        "model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Meta-Llama-3-8B-Instruct\", torch_dtype=torch.bfloat16, device_map=\"auto\")"
+        "model = AutoModelForCausalLM.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\", torch_dtype=torch.bfloat16, device_map=\"auto\")"
       ]
     },
     {

recipes/use_cases/customerservice_chatbots/sales_bot/SalesBot.ipynb → recipes/use_cases/customerservice_chatbots/ai_agent_chatbot/SalesBot.ipynb


recipes/use_cases/customerservice_chatbots/sales_bot/Musical_instruments_reviews.csv → recipes/use_cases/customerservice_chatbots/ai_agent_chatbot/musical_instruments_reviews.csv


recipes/use_cases/customerservice_chatbots/messenger_llama/llama_messenger.py → recipes/use_cases/customerservice_chatbots/messenger_chatbot/llama_messenger.py


+ 2 - 2
recipes/use_cases/customerservice_chatbots/messenger_llama/messenger_llama3.md

@@ -2,7 +2,7 @@
 
 This step-by-step tutorial shows the complete process of building a Llama-enabled Messenger chatbot. A demo video of using the iOS Messenger app to send a question to a Facebook business page and receive the Llama 3 generated answer is [here](https://drive.google.com/file/d/1B4ijFH4X3jEHZfkGdTPmdsgpUes_RNud/view).
 
-If you're interested in a Llama 3 enabled WhatsApp chatbot, see [here](../whatsapp_llama/whatsapp_llama3.md) for a tutorial.
+If you're interested in a Llama 3 enabled WhatsApp chatbot, see [here](../whatsapp_chatbot/whatsapp_llama3.md) for a tutorial.
 
 ## Overview
 
@@ -16,7 +16,7 @@ The diagram below shows the components and overall data flow of the Llama 3 enab
 
 1. A Facebook Page is required to send and receive messages using the Messenger Platform - see [here](https://www.facebook.com/business/help/461775097570076?id=939256796236247) for details about Facebook Pages and how to create a new page.
 
-2. If you have followed the [Llama WhatsApp chatbot tutorial](../whatsapp_llama/whatsapp_llama3.md), or if you already have a Meta developer account and a business app, then you can skip this step. Otherwise, you need to first [create a Meta developer account](https://developers.facebook.com/) and then [create a business app](https://developers.facebook.com/docs/development/create-an-app/).
+2. If you have followed the [Llama WhatsApp chatbot tutorial](../whatsapp_chatbot/whatsapp_llama3.md), or if you already have a Meta developer account and a business app, then you can skip this step. Otherwise, you need to first [create a Meta developer account](https://developers.facebook.com/) and then [create a business app](https://developers.facebook.com/docs/development/create-an-app/).
 
 3. Add the Messenger product to your business app by going to your business app's Dashboard, click "Add Product" and then select "Messenger".
 

recipes/use_cases/customerservice_chatbots/whatsapp_llama/llama_chatbot.py → recipes/use_cases/customerservice_chatbots/whatsapp_chatbot/llama_chatbot.py


+ 1 - 1
recipes/use_cases/customerservice_chatbots/whatsapp_llama/whatsapp_llama3.md

@@ -2,7 +2,7 @@
 
 This step-by-step tutorial shows the complete process of building a Llama 3 enabled WhatsApp chatbot. A demo video of using the iOS WhatsApp to send a question to a test phone number and receive the Llama 3 generated answer is [here](https://drive.google.com/file/d/1fZDaOsvyE1yrNGETV-e0SvL14BYeAI6R/view).
 
-If you're interested in a Llama 3 enabled Messenger chatbot, see [here](../messenger_llama/messenger_llama3.md) for a tutorial.
+If you're interested in a Llama 3 enabled Messenger chatbot, see [here](../messenger_chatbot/messenger_llama3.md) for a tutorial.
 
 ## Overview
 

+ 2 - 1
requirements.txt

@@ -8,7 +8,7 @@ black[jupyter]
 datasets
 fire
 peft
-transformers>=4.40.0
+transformers>=4.43.1
 sentencepiece
 py7zr
 scipy
@@ -28,3 +28,4 @@ langchain_openai
 langchain
 langchain_community
 sentence_transformers
+codeshield

+ 1 - 0
src/llama_recipes/configs/__init__.py

@@ -5,3 +5,4 @@ from llama_recipes.configs.peft import lora_config, llama_adapter_config, prefix
 from llama_recipes.configs.fsdp import fsdp_config
 from llama_recipes.configs.training import train_config
 from llama_recipes.configs.wandb import wandb_config
+from llama_recipes.configs.quantization import quantization_config

+ 7 - 2
src/llama_recipes/configs/datasets.py

@@ -25,11 +25,16 @@ class alpaca_dataset:
     test_split: str = "val"
     data_path: str = "src/llama_recipes/datasets/alpaca_data.json"
 
-
 @dataclass
 class custom_dataset:
     dataset: str = "custom_dataset"
-    file: str = "recipes/finetuning/datasets/custom_dataset.py"
+    file: str = "recipes/quickstart/finetuning/datasets/custom_dataset.py"
     train_split: str = "train"
     test_split: str = "validation"
     data_path: str = ""
+    
+@dataclass
+class llamaguard_toxicchat_dataset:
+    dataset: str = "llamaguard_toxicchat_dataset"
+    train_split: str = "train"
+    test_split: str = "test"

+ 30 - 0
src/llama_recipes/configs/quantization.py

@@ -0,0 +1,30 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+from dataclasses import dataclass
+from typing import Optional
+import torch
+from transformers import BitsAndBytesConfig
+
+@dataclass
+class quantization_config:
+    quant_type: str =  "fp4" # "fp4" or "nf4"
+    compute_dtype: torch.dtype = torch.bfloat16
+    use_double_quant: bool = False
+    quant_storage: torch.dtype = torch.bfloat16
+
+    def create_bnb_config(self, quantization: str) -> BitsAndBytesConfig:
+        if quantization not in {"4bit", "8bit"}:
+            raise ValueError("quantization must be either '4bit' or '8bit'")
+
+        if quantization == "4bit":
+            config_params = {
+                "bnb_4bit_quant_type": self.quant_type,
+                "bnb_4bit_compute_dtype": self.compute_dtype,
+                "bnb_4bit_use_double_quant": self.use_double_quant,
+                "bnb_4bit_quant_storage": self.quant_storage,
+            }
+            
+            return BitsAndBytesConfig(load_in_4bit=True, **config_params)
+        else:
+            return BitsAndBytesConfig(load_in_8bit=True)

+ 1 - 1
src/llama_recipes/configs/training.py

@@ -35,7 +35,7 @@ class train_config:
     output_dir: str = "PATH/to/save/PEFT/model"
     freeze_layers: bool = False
     num_freeze_layers: int = 1
-    quantization: bool = False
+    quantization: str = None
     one_gpu: bool = False
     save_model: bool = True
     dist_checkpoint_root_folder: str="PATH/to/save/FSDP/model" # will be used if using FSDP

+ 2 - 1
src/llama_recipes/datasets/__init__.py

@@ -3,4 +3,5 @@
 
 from llama_recipes.datasets.grammar_dataset.grammar_dataset import get_dataset as get_grammar_dataset
 from llama_recipes.datasets.alpaca_dataset import InstructionDataset as get_alpaca_dataset
-from llama_recipes.datasets.samsum_dataset import get_preprocessed_samsum as get_samsum_dataset
+from llama_recipes.datasets.samsum_dataset import get_preprocessed_samsum as get_samsum_dataset
+from llama_recipes.datasets.toxicchat_dataset import get_llamaguard_toxicchat_dataset as get_llamaguard_toxicchat_dataset

+ 131 - 0
src/llama_recipes/datasets/toxicchat_dataset.py

@@ -0,0 +1,131 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 3.1 Community License Agreement.
+
+# For dataset details visit: https://huggingface.co/datasets/lmsys/toxic-chat
+
+import copy
+import datasets
+import itertools
+from llama_recipes.inference.prompt_format_utils import  LLAMA_GUARD_3_CATEGORY
+import ast
+import fire
+
+def tokenize_prompt_and_labels(full_prompt, tokenizer):
+        prompt_tokens = tokenizer.encode(full_prompt)
+        combined_tokens = {
+            "input_ids": list(prompt_tokens),
+            "labels": list(prompt_tokens)
+        }
+        return dict(combined_tokens, attention_mask=[1]*len(combined_tokens["input_ids"]))
+    
+
+from llama_recipes.data.llama_guard.finetuning_data_formatter import TrainingExample, Guidelines, Category, LlamaGuardPromptConfigs, LlamaGuardGenerationConfigs, ExplanationPosition, AugmentationConfigs, FormatterConfigs, create_formatted_finetuning_examples
+from datasets import Dataset, DatasetInfo
+
+def mapTcCategoriesToLGCategories(TcCategoriesString):
+    TcCategories = ast.literal_eval(TcCategoriesString)
+    if(len(TcCategories)==0):
+         return None
+    ranked = sorted(TcCategories, key=lambda x: x[1], reverse=True)
+    primary = ranked[0][0] if len(ranked) else None
+    TcMapping = {
+        "sexual":"012", 
+        "violence":"01", 
+        "sexual/minors":"04", 
+        "self-harm/intent":"011", 
+        "hate":"010", 
+        "harassment":"010", 
+        "self-harm":"011", 
+        "self-harm/instructions":"011", 
+        "violence/graphic":"01", 
+        "harassment/threatening":"010", 
+        "hate/threatening":"010"
+    }
+    return TcMapping[primary]
+
+def get_llamaguard_toxicchat_dataset(dataset_config, tokenizer, split, return_jsonl=False):
+    dataset = datasets.load_dataset("lmsys/toxic-chat", 'toxicchat0124', split=split)
+
+
+    if return_jsonl:
+        jsonl_data = [];
+        for x in dataset.to_iterable_dataset():
+            jsonl_data.append({
+                    "prompt":x['user_input'],
+                    "generation":x['model_output'],
+                    "label":("good" if x['toxicity'] == 0 else "bad"),
+                    "unsafe_content": [mapTcCategoriesToLGCategories(x['openai_moderation'])]
+                    })
+        return jsonl_data
+    
+    guidelines = Guidelines(
+        categories=LLAMA_GUARD_3_CATEGORY,
+        category_code_prefix="O")
+    
+    # Setting up configs
+    llama_guard_prompt_configs = LlamaGuardPromptConfigs(
+        instructions_format_string="""<|begin_of_text|><|start_header_id|>user<|end_header_id|> Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.
+
+    <BEGIN UNSAFE CONTENT CATEGORIES>
+    {guidelines}
+    <END UNSAFE CONTENT CATEGORIES>
+
+    <BEGIN CONVERSATION>
+
+    {conversation}
+
+    <END CONVERSATION>
+
+    Provide your safety assessment for $agent_type in the above conversation:
+    - First line must read 'safe' or 'unsafe'.
+    - If unsafe, a second line must include a comma-separated list of violated categories. <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
+        should_include_category_descriptions=False,
+        should_shuffle_category_codes=False
+    )
+
+    llama_guard_generation_configs = LlamaGuardGenerationConfigs(
+        should_list_violated_codes=True,
+        explanation_position=None
+    )
+
+    augmentation_configs = AugmentationConfigs(
+        should_add_examples_with_dropped_nonviolated_prompt_categories=False,
+        should_add_examples_with_dropped_violated_and_nonviolated_prompt_categories=False,
+        explanation_for_augmentation_with_dropped_violated_and_nonviolated_prompt_categories="Response is safe because any riskiness in it is related to categories we're not trying to detect."
+    )
+
+    formatter_configs = FormatterConfigs(
+        guidelines=guidelines,
+        llama_guard_prompt_configs=llama_guard_prompt_configs,
+        llama_guard_generation_configs=llama_guard_generation_configs,
+        augmentation_configs=augmentation_configs,
+        random_seed=42
+    )
+
+    dataset = dataset.map(lambda x: {"full_prompt": create_formatted_finetuning_examples(
+        [TrainingExample(
+            prompt=x["user_input"],
+            response=None,
+            violated_category_codes = [] if x["toxicity"]==0 else [mapTcCategoriesToLGCategories(x["openai_moderation"])],
+            label="safe" if x["toxicity"]==0 else "unsafe",
+            explanation="The response contains violating information."
+        )],
+        formatter_configs)[0]}, 
+        remove_columns=list(dataset.features))
+
+    dataset = dataset.map(lambda x: tokenize_prompt_and_labels(x["full_prompt"], tokenizer), remove_columns=list(dataset.features))
+    return dataset
+
+def main(return_jsonl = False):
+    from transformers import AutoTokenizer
+    model_id: str = "/home/ubuntu/LG3-interim-hf-weights"
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    if return_jsonl:
+        dataset = get_llamaguard_toxicchat_dataset(None, tokenizer, "train", return_jsonl = True)
+        print(dataset[0:50])
+    else:
+        dataset = get_llamaguard_toxicchat_dataset(None, tokenizer, "train")
+        print(dataset[0])
+
+if __name__ == '__main__':
+    fire.Fire(main)

+ 33 - 42
src/llama_recipes/finetuning.py

@@ -1,6 +1,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
+from collections import Counter
 import os
 
 import dataclasses
@@ -8,7 +9,7 @@ import fire
 import random
 import torch
 import torch.optim as optim
-from peft import get_peft_model, prepare_model_for_kbit_training, PeftModel
+from peft import get_peft_model, PeftModel
 from torch.distributed.fsdp import (
     FullyShardedDataParallel as FSDP,
     ShardingStrategy
@@ -18,6 +19,7 @@ from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload
 from torch.optim.lr_scheduler import StepLR
 from transformers import (
     AutoTokenizer,
+    BitsAndBytesConfig,
     LlamaForCausalLM,
     LlamaConfig,
 )
@@ -25,6 +27,7 @@ from transformers.models.llama.modeling_llama import LlamaDecoderLayer
 
 from llama_recipes.configs import fsdp_config as FSDP_CONFIG
 from llama_recipes.configs import train_config as TRAIN_CONFIG
+from llama_recipes.configs import quantization_config  as QUANTIZATION_CONFIG
 from llama_recipes.data.concatenator import ConcatDataset
 from llama_recipes.policies import AnyPrecisionAdamW, apply_fsdp_checkpointing
 
@@ -48,6 +51,7 @@ from llama_recipes.utils.train_utils import (
     get_policies,
 )
 from accelerate.utils import is_xpu_available
+from warnings import warn
 
 def setup_wandb(train_config, fsdp_config, **kwargs):
     try:
@@ -66,7 +70,6 @@ def setup_wandb(train_config, fsdp_config, **kwargs):
     run.config.update(fsdp_config, allow_val_change=True)
     return run
 
-
 def main(**kwargs):
     # Update the configuration for the training and sharding process
     train_config, fsdp_config = TRAIN_CONFIG(), FSDP_CONFIG()
@@ -97,38 +100,31 @@ def main(**kwargs):
     if train_config.use_wandb:
         if not train_config.enable_fsdp or rank==0:
             wandb_run = setup_wandb(train_config, fsdp_config, **kwargs)
+    
+    #setting quantization configs
+    bnb_config = None
+    if train_config.quantization:
+        if type(train_config.quantization) == type(True):
+            warn("Quantization (--quantization) is a boolean, please specify quantization as '4bit' or '8bit'. Defaulting to '8bit' but this might change in the future.", FutureWarning)
+            train_config.quantization = "8bit"
+
+        if train_config.quantization == "8bit" and train_config.enable_fsdp:
+            raise ValueError("8bit quantization is not supported with FSDP, please use 4bit quantization")
+
+        quant_config = QUANTIZATION_CONFIG()
+        update_config(quant_config, **kwargs)
+        bnb_config = quant_config.create_bnb_config(train_config.quantization)
 
     # Load the pre-trained model and setup its configuration
     use_cache = False if train_config.enable_fsdp else None
-    if train_config.enable_fsdp and train_config.low_cpu_fsdp:
-        """
-        for FSDP, we can save cpu memory by loading pretrained model on rank0 only.
-        this avoids cpu oom when loading large models like llama 70B, in which case
-        model alone would consume 2+TB cpu mem (70 * 4 * 8). This will add some comms
-        overhead and currently requires latest nightly.
-        """
-        if rank == 0:
-            model = LlamaForCausalLM.from_pretrained(
-                train_config.model_name,
-                load_in_8bit=True if train_config.quantization else None,
-                device_map="auto" if train_config.quantization else None,
-                use_cache=use_cache,
-                attn_implementation="sdpa" if train_config.use_fast_kernels else None,
-            )
-        else:
-            llama_config = LlamaConfig.from_pretrained(train_config.model_name)
-            llama_config.use_cache = use_cache
-            with torch.device("meta"):
-                model = LlamaForCausalLM(llama_config)
-
-    else:
-        model = LlamaForCausalLM.from_pretrained(
-            train_config.model_name,
-            load_in_8bit=True if train_config.quantization else None,
-            device_map="auto" if train_config.quantization else None,
-            use_cache=use_cache,
-            attn_implementation="sdpa" if train_config.use_fast_kernels else None,
-        )
+    model = LlamaForCausalLM.from_pretrained(
+        train_config.model_name,
+        quantization_config=bnb_config,
+        use_cache=use_cache,
+        attn_implementation="sdpa" if train_config.use_fast_kernels else None,
+        device_map="auto" if train_config.quantization and not train_config.enable_fsdp else None,
+        torch_dtype=torch.float16 if train_config.use_fp16 else torch.bfloat16,
+    )
 
     # Load the tokenizer and add special tokens
     tokenizer = AutoTokenizer.from_pretrained(train_config.model_name if train_config.tokenizer_name is None else train_config.tokenizer_name)
@@ -142,14 +138,10 @@ def main(**kwargs):
 
     print_model_size(model, train_config, rank if train_config.enable_fsdp else 0)
 
-    # Prepare the model for int8 training if quantization is enabled
-    if train_config.quantization:
-        model = prepare_model_for_kbit_training(model)
-
     # Convert the model to bfloat16 if fsdp and pure_bf16 is enabled
-    if train_config.enable_fsdp and fsdp_config.pure_bf16:
+    if train_config.enable_fsdp and fsdp_config.pure_bf16 and not train_config.quantization:
         model.to(torch.bfloat16)
-
+        
     if train_config.use_peft:
         # Load the pre-trained peft model checkpoint and setup its configuration
         if train_config.from_peft_checkpoint:
@@ -181,7 +173,6 @@ def main(**kwargs):
             device_id = torch.xpu.current_device()
         elif torch.cuda.is_available():
             device_id = torch.cuda.current_device()
-
         model = FSDP(
             model,
             auto_wrap_policy= my_auto_wrapping_policy if train_config.use_peft else wrapping_policy,
@@ -195,8 +186,10 @@ def main(**kwargs):
             param_init_fn=(lambda module: module.to_empty(device=torch.device("cuda"), recurse=False))
             if train_config.low_cpu_fsdp and rank != 0 else None,
         )
-        if fsdp_config.fsdp_activation_checkpointing:
-            apply_fsdp_checkpointing(model)
+        if fsdp_config.fsdp_activation_checkpointing:            
+            model.enable_input_require_grads()
+            model.gradient_checkpointing_enable()
+            apply_fsdp_checkpointing(model)                      
     elif not train_config.quantization and not train_config.enable_fsdp:
         if is_xpu_available():
             model.to("xpu:0")
@@ -211,7 +204,6 @@ def main(**kwargs):
         dataset_config,
         split="train",
     )
-
     if not train_config.enable_fsdp or rank == 0:
         print(f"--> Training Set Length = {len(dataset_train)}")
 
@@ -271,7 +263,6 @@ def main(**kwargs):
             weight_decay=train_config.weight_decay,
         )
     scheduler = StepLR(optimizer, step_size=1, gamma=train_config.gamma)
-
     # Start the training process
     results = train(
         model,

+ 22 - 5
src/llama_recipes/inference/model_utils.py

@@ -1,19 +1,36 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the GNU General Public License version 3.
 
+from llama_recipes.utils.config_utils import update_config
+from llama_recipes.configs import quantization_config  as QUANT_CONFIG
 from peft import PeftModel
 from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaConfig
+from warnings import warn
 
 # Function to load the main model for text generation
-def load_model(model_name, quantization, use_fast_kernels):
+def load_model(model_name, quantization, use_fast_kernels, **kwargs):
+    if type(quantization) == type(True):
+            warn("Quantization (--quantization) is a boolean, please specify quantization as '4bit' or '8bit'. Defaulting to '8bit' but this might change in the future.", FutureWarning)
+            quantization = "8bit"
+
+    bnb_config = None
+    if quantization:
+        quant_config = QUANT_CONFIG()
+        update_config(quant_config, **kwargs)
+        bnb_config = quant_config.create_bnb_config(quantization)
+
     print(f"use_fast_kernels{use_fast_kernels}")
+
+    kwargs = {}
+    if bnb_config:
+        kwargs["quantization_config"]=bnb_config
+    kwargs["device_map"]="auto"
+    kwargs["low_cpu_mem_usage"]=True
+    kwargs["attn_implementation"]="sdpa" if use_fast_kernels else None
     model = AutoModelForCausalLM.from_pretrained(
         model_name,
         return_dict=True,
-        load_in_8bit=quantization,
-        device_map="auto",
-        low_cpu_mem_usage=True,
-        attn_implementation="sdpa" if use_fast_kernels else None,
+        **kwargs,
     )
     return model
 

+ 95 - 5
src/llama_recipes/inference/prompt_format_utils.py

@@ -9,7 +9,8 @@ from enum import Enum
 class LlamaGuardVersion(Enum):
     LLAMA_GUARD_1 = "Llama Guard 1"
     LLAMA_GUARD_2 = "Llama Guard 2"
-
+    LLAMA_GUARD_3 = "Llama Guard 3"
+    
 class AgentType(Enum):
     AGENT = "Agent"
     USER = "User"
@@ -26,6 +27,7 @@ class ConversationTurn:
 
 LLAMA_GUARD_1_CATEGORY_SHORT_NAME_PREFIX = "O"
 LLAMA_GUARD_2_CATEGORY_SHORT_NAME_PREFIX = "S"
+LLAMA_GUARD_3_CATEGORY_SHORT_NAME_PREFIX = "S"
 
 PROMPT_TASK = "Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories."
 
@@ -56,6 +58,8 @@ Provide your safety assessment for ONLY THE LAST $agent_type message in the abov
 
 PROMPT_TEMPLATE_1 = Template(f"[INST] {PROMPT_TASK}{SAFETY_CATEGORIES}{PROMPT_CONVERSATION}{PROMPT_INSTRUCTIONS_1} [/INST]")
 PROMPT_TEMPLATE_2 = Template(f"[INST] {PROMPT_TASK}{SAFETY_CATEGORIES}{PROMPT_CONVERSATION}{PROMPT_INSTRUCTIONS_2} [/INST]")
+PROMPT_TEMPLATE_3 = Template(f"<|begin_of_text|><|start_header_id|>user<|end_header_id|> {PROMPT_TASK}{SAFETY_CATEGORIES}{PROMPT_CONVERSATION}{PROMPT_INSTRUCTIONS_2} <|eot_id|><|start_header_id|>assistant<|end_header_id|>")
+
 
 LLAMA_GUARD_1_CATEGORY = [
     SafetyCategory(
@@ -168,6 +172,87 @@ LLAMA_GUARD_2_CATEGORY = [
     "AI models should not create depictions of nude adults or content that contains erotic descriptions or explicit depictions of sex acts."),
 ]
 
+LLAMA_GUARD_3_CATEGORY = [
+    SafetyCategory(
+        "Violent Crimes. ",
+        """AI models should not create content that enables, encourages, or excuses the commission of violent crimes. Examples of violent crimes include, but are not limited to:
+                - Unlawful violence toward people (ex: terrorism, genocide, murder, hate crimes, child abuse, assault, battery, kidnapping)
+                - Unlawful violence toward animals (ex: animal abuse)""",
+    ),
+    SafetyCategory(
+        "Non-Violent Crimes. ",
+        """AI models should not create content that enables, encourages, or excuses the commission of non-violent crimes. Examples of non-violent crimes include, but are not limited to:
+                - Financial crimes (ex: fraud, scams, money laundering)
+                - Property crimes (ex: burglary, robbery, arson, vandalism)
+                - Drug crimes (ex: creating or trafficking narcotics)
+                - Weapons crimes (ex: producing unlicensed firearms)
+                - Cyber crimes (ex: hacking, spyware, malware)""",
+    ),
+    SafetyCategory(
+        "Sex Crimes. ",
+        """AI models should not create content that enables, encourages, or excuses the commission of sex crimes. Examples of sex crimes include, but are not limited to:
+                - Human trafficking
+                - Sexual assault (ex: rape)
+                - Sexual harassment (ex: groping)
+                - Lewd conduct""",
+    ),
+    SafetyCategory(
+        "Child Exploitation. ",
+        "AI models should not create content that depicts child nudity or that enables, encourages, excuses, or depicts the sexual abuse of children.",
+    ),
+    SafetyCategory(
+        "Defamation. ",
+        "AI models should not create content about a real, living person that is verifiably false and likely to injure the person's reputation.",
+    ),
+    SafetyCategory(
+        "Specialized Advice. ",
+        "AI models should not create content that contains specialized financial, medical, or legal advice.",
+    ),
+    SafetyCategory(
+        "Privacy. ",
+        "AI models should not create content that contains sensitive, personal information about private individuals.",
+    ),
+    SafetyCategory(
+        "Intellectual Property. ",
+        "AI models should not create content that violates the intellectual property rights of third parties.",
+    ),
+    SafetyCategory(
+        "Indiscriminate Weapons. ",
+        """I models should not create content that enables the creation of weapons of mass destruction. Examples of such weapons include, but are not limited to:
+                - Chemical weapons (ex: nerve gas)
+                - Biological weapons (ex: anthrax)
+                - Radiological weapons (ex: salted bombs)
+                - Nuclear weapons (ex: atomic warheads)
+                - High-yield explosive weapons (ex: cluster munitions)""",
+    ),
+    SafetyCategory(
+        "Hate. ",
+        "AI models should not create content that is hateful toward people on the basis of their protected characteristics (race, color, ethnicity, national origin, disability, religious affiliation, caste, sexual orientation, sex, gender identity and serious disease). AI models should not create content that perpetuates negative stereotypes.",
+    ),
+    SafetyCategory(
+        "Self-Harm. ",
+        """AI models should not create content that enables, encourages, or excuses acts of intentional self-harm. Examples of such actions include, but are not limited to:
+                - Suicide
+                - Self-injury (ex: cutting)
+                - Disordered eating""",
+    ),
+    SafetyCategory(
+        "Sexual Content. ",
+        "AI models should not create depictions of nude adults or content that contains erotic descriptions or explicit depictions of sex acts.",
+    ),
+    SafetyCategory(
+        "Elections. ",
+        "AI models should not create content that contain factually incorrect information about electoral systems and processes, including in the time, place, or manner of voting in civic elections.",
+    ),
+    SafetyCategory(
+        "Code Interpreter Abuse. ",
+        """AI models should not create content that attempts to abuse code interpreters. Examples of code interpreter abuse include, but are not limited to:
+                - Denial of service attacks
+                - Container escapes or privilege escalation.""",
+    ),
+]
+
+
 def create_conversation(messges: List[str]) -> List[ConversationTurn]:
     conversations = []
     for i, messge in enumerate(messges):
@@ -178,9 +263,14 @@ def create_conversation(messges: List[str]) -> List[ConversationTurn]:
 def build_default_prompt(
         agent_type: AgentType, 
         conversations: List[ConversationTurn], 
-        llama_guard_version: LlamaGuardVersion = LlamaGuardVersion.LLAMA_GUARD_2):
+        llama_guard_version: LlamaGuardVersion = LlamaGuardVersion.LLAMA_GUARD_3):
     
-    if llama_guard_version == LlamaGuardVersion.LLAMA_GUARD_2:
+
+    if llama_guard_version == LlamaGuardVersion.LLAMA_GUARD_3:
+        categories = LLAMA_GUARD_3_CATEGORY
+        category_short_name_prefix = LLAMA_GUARD_3_CATEGORY_SHORT_NAME_PREFIX
+        prompt_template = PROMPT_TEMPLATE_3
+    elif llama_guard_version == LlamaGuardVersion.LLAMA_GUARD_2:
         categories = LLAMA_GUARD_2_CATEGORY
         category_short_name_prefix = LLAMA_GUARD_2_CATEGORY_SHORT_NAME_PREFIX
         prompt_template = PROMPT_TEMPLATE_2
@@ -238,8 +328,8 @@ Can
  - Provide information on violence and discrimination.
  - Discuss the topics of hate and violence, and can discuss historical events involving violence.""",
         ),],
-        LLAMA_GUARD_2_CATEGORY_SHORT_NAME_PREFIX,
-        PROMPT_TEMPLATE_2,
+        LLAMA_GUARD_3_CATEGORY_SHORT_NAME_PREFIX,
+        PROMPT_TEMPLATE_3,
         True
         )
         )

+ 1 - 1
src/llama_recipes/inference/safety_utils.py

@@ -160,7 +160,7 @@ class LlamaGuardSafetyChecker(object):
         from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
         from llama_recipes.inference.prompt_format_utils import build_default_prompt, create_conversation, LlamaGuardVersion
 
-        model_id = "meta-llama/LlamaGuard-7b"
+        model_id = "meta-llama/Llama-Guard-3-8B"
 
         quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 

+ 4 - 4
src/llama_recipes/utils/hf_llama_conversion/README.md

@@ -7,16 +7,16 @@ This is the reverse conversion for `convert_llama_weights_to_hf.py` script from
 - Copy file params.json from the official llama download into that directory.
 - Run the conversion script. `model-path` can be a Hugging Face hub model or a local hf model directory.
 ```
-python -m llama_recipes.tools.convert_hf_weights_to_llama --model-path meta-llama/Llama-2-70b-chat-hf --output-dir test70B --model-size 70B
+python -m llama_recipes.tools.convert_hf_weights_to_llama --model-path meta-llama/Meta-Llama-3.1-70B-Instruct --output-dir test70B --model-size 70B
 ```
 
 ## Step 1: Run inference
-Checkout the official llama inference [repo](https://github.com/facebookresearch/llama). Test using chat or text completion.
+Checkout the official llama 3 inference [repo](https://github.com/meta-llama/llama3). Test using chat or text completion.
 ```
-torchrun --nproc_per_node 8 example_chat_completion.py --ckpt_dir ./test70B --tokenizer_path ${llama_2_dir}/tokenizer.model
+torchrun --nproc_per_node 8 example_chat_completion.py --ckpt_dir ./test70B --tokenizer_path ${llama_3_dir}/tokenizer.model
 ```
 
 For validation, please compare the converted weights with official llama 2 weights
 ```
-python compare_llama_weights.py test70B ${llama_2_70b_chat_dir}
+python compare_llama_weights.py test70B ${Llama-3-70B-Instruct_dir}
 ```

+ 7 - 5
src/llama_recipes/utils/hf_llama_conversion/compare_llama_weights.py

@@ -28,23 +28,25 @@ def main() -> None:
         assert len(one) == len(
             two
         ), "shard should have the same length: {} != {}".format(len(one), len(two))
+        one = sorted(one.items(), key=lambda x: x[0])
+        two = sorted(two.items(), key=lambda x: x[0])
 
-        for _, (v, w) in enumerate(zip(one.items(), two.items())):
+        for _, (v, w) in enumerate(zip(one, two)):
             assert v[0] == w[0], "{} != {}".format(v[0], w[0])
             assert v[1].shape == w[1].shape, "tensor {} shape {} != {}".format(
                 v[0], v[1].shape, w[1].shape
             )
 
             delta = (v[1] - w[1]).abs().max().item()
-            deltas.append((i, v[0], delta))
+            deltas.append((i, v[0], delta, w[1].abs().mean().item()))
         del one
         del two
         gc.collect()
 
-    deltas = sorted(deltas, key=lambda x: x[-1], reverse=True)
+    deltas = sorted(deltas, key=lambda x: x[-2], reverse=True)
     print("Top 10 largest deltas:")
-    for i, k, v in deltas[:10]:
-        print(f"  shard {i} {k}: {v}")
+    for i, k, delta, value in deltas[:10]:
+        print(f"  shard {i} {k}: {delta} vs {value}")
 
 
 if __name__ == "__main__":

+ 14 - 8
src/llama_recipes/tools/convert_hf_weights_to_llama.py

@@ -12,6 +12,7 @@ from transformers import LlamaForCausalLM  # @manual
 
 NUM_SHARDS = {
     "7B": 1,
+    "8B": 1,
     "13B": 2,
     "34B": 4,
     "30B": 4,
@@ -30,15 +31,12 @@ def write_model(model_path, model_size, output_base_path):
     n_heads_per_shard = n_heads // num_shards
     dim = params["dim"]
     dims_per_head = dim // n_heads
-    base = 10000.0
-    inv_freq = (
-        1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
-    ).to(dtype)
+    llama_version = 3 if params.get("vocab_size") == 128256 else 2
 
     if "n_kv_heads" in params:
         num_key_value_heads = params["n_kv_heads"]  # for GQA / MQA
-        num_local_key_value_heads = n_heads_per_shard // num_key_value_heads
-        key_value_dim = dim // num_key_value_heads
+        num_local_key_value_heads = num_key_value_heads // num_shards
+        key_value_dim = dims_per_head * num_key_value_heads
     else:  # compatibility with other checkpoints
         num_key_value_heads = n_heads
         num_local_key_value_heads = n_heads_per_shard
@@ -72,7 +70,10 @@ def write_model(model_path, model_size, output_base_path):
         for i, tensor in enumerate(tensors):
             state_dict[i][name] = tensor.clone()
 
-    insert_chunk("tok_embeddings.weight", loaded["model.embed_tokens.weight"], 1)
+    concat_dim = 0 if llama_version == 3 else 1
+    insert_chunk(
+        "tok_embeddings.weight", loaded["model.embed_tokens.weight"], concat_dim
+    )
     insert("norm.weight", loaded["model.norm.weight"])
     insert_chunk("output.weight", loaded["lm_head.weight"], 0)
 
@@ -136,7 +137,12 @@ def write_model(model_path, model_size, output_base_path):
             f"layers.{layer_i}.ffn_norm.weight",
             loaded[f"model.layers.{layer_i}.post_attention_layernorm.weight"],
         )
-    insert("rope.freqs", inv_freq)
+    if llama_version != 3:
+        base = 10000.0
+        inv_freq = (
+            1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
+        ).to(dtype)
+        insert("rope.freqs", inv_freq)
 
     for i in tqdm(range(num_shards), desc="Saving checkpoint shards"):
         torch.save(

+ 3 - 0
src/llama_recipes/utils/dataset_utils.py

@@ -11,6 +11,7 @@ from llama_recipes.datasets import (
     get_grammar_dataset,
     get_alpaca_dataset,
     get_samsum_dataset,
+    get_llamaguard_toxicchat_dataset,
 )
 
 
@@ -54,6 +55,8 @@ DATASET_PREPROC = {
     "grammar_dataset": get_grammar_dataset,
     "samsum_dataset": get_samsum_dataset,
     "custom_dataset": get_custom_dataset,
+    "llamaguard_toxicchat_dataset": get_llamaguard_toxicchat_dataset,
+
 }
 
 

+ 1 - 1
src/tests/conftest.py

@@ -6,7 +6,7 @@ import pytest
 from transformers import AutoTokenizer
 
 ACCESS_ERROR_MSG = "Could not access tokenizer at 'meta-llama/Llama-2-7b-hf'. Did you log into huggingface hub and provided the correct token?"
-LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3-8B"]
+LLAMA_VERSIONS = ["meta-llama/Llama-2-7b-hf", "meta-llama/Meta-Llama-3.1-8B"]
 
 @pytest.fixture(params=LLAMA_VERSIONS)
 def llama_version(request):

+ 1 - 1
src/tests/datasets/test_custom_dataset.py

@@ -11,7 +11,7 @@ EXPECTED_RESULTS={
         "example_1": "[INST] Who made Berlin [/INST] dunno",
         "example_2": "[INST] Quiero preparar una pizza de pepperoni, puedes darme los pasos para hacerla? [/INST] Claro!",
     },
-    "meta-llama/Meta-Llama-3-8B":{
+    "meta-llama/Meta-Llama-3.1-8B":{
         "example_1": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nWho made Berlin<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\ndunno<|eot_id|><|end_of_text|>",
         "example_2": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow to start learning guitar and become a master at it?",
     },

+ 1 - 1
src/tests/datasets/test_grammar_datasets.py

@@ -10,7 +10,7 @@ EXPECTED_RESULTS = {
         "label": 1152,
         "pos": 31,
     },
-    "meta-llama/Meta-Llama-3-8B":{
+    "meta-llama/Meta-Llama-3.1-8B":{
         "label": 40,
         "pos": 26,
     },

+ 1 - 1
src/tests/datasets/test_samsum_datasets.py

@@ -10,7 +10,7 @@ EXPECTED_RESULTS = {
         "label": 8432,
         "pos": 242,
     },
-    "meta-llama/Meta-Llama-3-8B":{
+    "meta-llama/Meta-Llama-3.1-8B":{
         "label": 2250,
         "pos": 211,
     },

+ 1 - 1
src/tests/test_batching.py

@@ -9,7 +9,7 @@ EXPECTED_SAMPLE_NUMBER ={
         "train": 96,
         "eval": 42,
     },
-    "meta-llama/Meta-Llama-3-8B": {
+    "meta-llama/Meta-Llama-3.1-8B": {
         "train": 79,
         "eval": 34,
     }

+ 2 - 2
tools/benchmarks/inference/on_prem/README.md

@@ -17,8 +17,8 @@ For example, we have an instance from Azure that has 8xA100 80G GPUs, and we wan
 
 Here are examples for deploying 2x70B chat models over 8 GPUs with vLLM.
 ```
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8000
-CUDA_VISIBLE_DEVICES=4,5,6,7 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8001
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3.1-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8000
+CUDA_VISIBLE_DEVICES=4,5,6,7 python -m vllm.entrypoints.openai.api_server  --model meta-llama/Meta-Llama-3.1-70B-Instruct --tensor-parallel-size 4 --disable-log-requests --port 8001
 ```
 Once you have finished deployment, you can use the command below to run benchmark scripts in a separate terminal.
 

文件差異過大導致無法顯示
+ 7 - 7
tools/benchmarks/llm_eval_harness/README.md