
adding raft_dataset for fine-tuning

Kai Wu 11 months ago
parent
commit
7367f7eae6

+ 12 - 14
recipes/finetuning/datasets/chatbot_dataset.py

@@ -11,24 +11,22 @@ import itertools
 B_INST, E_INST = "[INST]", "[/INST]"
 
 def tokenize_dialog(q_a_pair, tokenizer):
-    prompt_tokens = [tokenizer.encode(f"{tokenizer.bos_token}{B_INST} {(question).strip()} {E_INST}", add_special_tokens=False) for question in q_a_pair["Question"]]
-    answer_tokens = [tokenizer.encode(f"{answer.strip()} {tokenizer.eos_token}", add_special_tokens=False) for answer in q_a_pair["Answer"]]
-    dialog_tokens = list(itertools.chain.from_iterable(zip(prompt_tokens, answer_tokens)))
-    dialog_tokens = list(itertools.chain.from_iterable(zip(prompt_tokens, answer_tokens)))
-    #Add labels, convert prompt token to -100 in order to ignore in loss function
-    labels_tokens = [len(c)*[-100,] if i % 2 == 0 else c for i,c in enumerate(dialog_tokens)]
+    question, answer = q_a_pair["Question"], q_a_pair["Answer"]
+    prompt_tokens = tokenizer.encode(f"{tokenizer.bos_token}{B_INST} {(question).strip()} {E_INST}", add_special_tokens=False)
+    answer_tokens = tokenizer.encode(f"{answer.strip()} {tokenizer.eos_token}", add_special_tokens=False)
+    sample = {
+            "input_ids": prompt_tokens + answer_tokens,
+            "attention_mask" : [1] * (len(prompt_tokens) + len(answer_tokens)),
+            "labels": [-100] * len(prompt_tokens) + answer_tokens,
+            }
 
-    combined_tokens = {
-        "input_ids": list(itertools.chain(*(t for t in dialog_tokens))),
-        "labels": list(itertools.chain(*(t for t in labels_tokens))),
-    }
-
-    return dict(combined_tokens, attention_mask=[1]*len(combined_tokens["input_ids"]))
+    return sample
 
 
 def get_custom_dataset(dataset_config, tokenizer, split, split_ratio=0.8):
-    dataset = load_dataset('json', data_files=dataset_config.data_path)
-    dataset = dataset['train'].train_test_split(test_size=1-split_ratio, shuffle=True)
+    dataset_dict = load_dataset('json', data_files=dataset_config.data_path)
+    dataset = dataset_dict['train']
+    dataset = dataset.train_test_split(test_size=1-split_ratio, shuffle=True, seed=42)
 
     dataset = dataset[split].map(lambda sample: {
         "Question": sample["Question"],

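For reference, a minimal sketch of what the simplified `tokenize_dialog` now produces for a single `Question`/`Answer` pair (the checkpoint name and sample values are illustrative, and `tokenize_dialog` is assumed to be importable from the file above):

```python
# Illustrative only: exercise the new tokenize_dialog on one Q/A pair.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # assumed checkpoint
pair = {"Question": "What is RAFT?", "Answer": "Retrieval Augmented Fine Tuning."}

sample = tokenize_dialog(pair, tokenizer)
# Prompt tokens are labeled -100, so the loss is computed only on the answer tokens.
assert len(sample["input_ids"]) == len(sample["labels"]) == len(sample["attention_mask"])
```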
+ 55 - 0
recipes/finetuning/datasets/raft_dataset.py

@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
+
+
+from datasets import load_dataset
+
+
+B_INST, E_INST = "[INST]", "[/INST]"
+
+def raft_tokenize(q_a_pair, tokenizer):
+    # last line is the question
+    question = q_a_pair["instruction"].split('\n')[-1]
+    # all the lines before the last line are the context
+    documents = q_a_pair["instruction"].split('\n')[:-1]
+    # output is the label
+    answer = q_a_pair["output"]
+    system_prompt = "You are a helpful question answerer who can provide an answer given a question and relevant context."
+    user_prompt = """
+        Question: {question}\nContext: {context}\n
+        Answer this question using the information given in the context above. Here are things to pay attention to:
+        - First provide step-by-step reasoning on how to answer the question.
+        - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.
+        - End your response with final answer in the form <ANSWER>: $answer, the answer should be succinct.
+        You MUST begin your final answer with the tag "<ANSWER>:".
+    """.format(question=question, context=str(documents))
+    final_prompt = system_prompt + '\n' + user_prompt
+    prompt_tokens = tokenizer.encode(f"{tokenizer.bos_token}{B_INST} {(final_prompt).strip()} {E_INST}", add_special_tokens=False)
+    answer_tokens = tokenizer.encode(f"{answer.strip()} {tokenizer.eos_token}", add_special_tokens=False)
+    #Add labels, convert prompt token to -100 in order to ignore in loss function
+    sample = {
+            "input_ids": prompt_tokens + answer_tokens,
+            "attention_mask" : [1] * (len(prompt_tokens) + len(answer_tokens)),
+            "labels": [-100] * len(prompt_tokens) + answer_tokens,
+            }
+
+    return sample
+
+
+def get_custom_dataset(dataset_config, tokenizer, split, split_ratio=0.8):
+    # load_dataset will return a DatasetDict that contains all the data in the train split
+    dataset_dict = load_dataset('json', data_files=dataset_config.data_path)
+    dataset = dataset_dict['train']
+    dataset = dataset.train_test_split(test_size=1-split_ratio, shuffle=True, seed=42)
+
+    dataset = dataset[split].map(lambda sample: {
+        "instruction": sample["instruction"],
+        "output": sample["cot_answer"],
+        },
+        batched=True,
+    )
+    dataset = dataset.map(lambda x: raft_tokenize(x, tokenizer))
+    return dataset
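For context, a hypothetical record in the JSON format this loader expects (field names follow `get_custom_dataset` above; the document text and answer are made up):

```python
# Hypothetical RAFT training record: context documents plus the question on the last line,
# and a chain-of-thought answer using the quote/answer tags from the COT prompt.
record = {
    "instruction": "<DOCUMENT>Llama 3 models were pretrained on over 15T tokens.</DOCUMENT>\n"
                   "How many tokens were Llama 3 models pretrained on?",
    "cot_answer": "The context states ##begin_quote##pretrained on over 15T tokens##end_quote##. "
                  "<ANSWER>: over 15T tokens",
}
# get_custom_dataset renames cot_answer to output, then raft_tokenize masks the prompt tokens
# with -100 so only the chain-of-thought answer is supervised.
```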

File diff suppressed because it is too large
+ 33 - 5
recipes/use_cases/end2end-recipes/raft/README.md


+ 0 - 55
recipes/use_cases/end2end-recipes/raft/data/FAQ.md

@@ -1,55 +0,0 @@
-# FAQ
-
-Here we discuss frequently asked questions that may occur and we found useful along the way.
-
-1. Does FSDP support mixed precision in one FSDP unit? Meaning, in one FSDP unit some of the parameters are in Fp16/Bf16 and others in FP32.
-
-    FSDP requires each FSDP unit to have consistent precision, so this case is not supported at this point. It might be added in future but no ETA at the moment.
-
-2.  How does FSDP handles mixed grad requirements?
-
-    FSDP does not support mixed `require_grad` in one FSDP unit. This means if you are planning to freeze some layers, you need to do it on the FSDP unit level rather than model layer. For example, let us assume our model has 30 decoder layers and we want to freeze the bottom 28 layers and only train 2 top transformer layers. In this case, we need to make sure `require_grad` for the top two transformer layers are set to `True`.
-
-3. How do PEFT methods work with FSDP in terms of grad requirements/layer freezing?
-
-    We wrap the PEFT modules separate from the transformer layer in auto_wrapping policy, that would result in PEFT models having `require_grad=True` while the rest of the model is  `require_grad=False`.
-
-4. Can I add custom datasets?
-
-    Yes, you can find more information on how to do that [here](Dataset.md).
-
-5. What are the hardware SKU requirements for deploying these models?
-
-    Hardware requirements vary based on latency, throughput and cost constraints. For good latency, the models were split across multiple GPUs with tensor parallelism in a machine with NVIDIA A100s or H100s. But TPUs, other types of GPUs like A10G, T4, L4, or even commodity hardware can also be used to deploy these models (e.g. https://github.com/ggerganov/llama.cpp).
-    If working on a CPU, it is worth looking at this [blog post](https://www.intel.com/content/www/us/en/developer/articles/news/llama2.html) from Intel for an idea of Llama 2's performance on a CPU.
-
-6. What are the hardware SKU requirements for fine-tuning Llama pre-trained models?
-
-    Fine-tuning requirements vary based on amount of data, time to complete fine-tuning and cost constraints. To fine-tune these models we have generally used multiple NVIDIA A100 machines with data parallelism across nodes and a mix of data and tensor parallelism intra node. But using a single machine, or other GPU types like NVIDIA A10G or H100 are definitely possible (e.g. alpaca models are trained on a single RTX4090: https://github.com/tloen/alpaca-lora).
-
-7. How to handle CUDA memory fragmentations during fine-tuning that may lead into an OOM?
-
-    In some cases you may experience that after model checkpointing specially with FSDP (this usually does not happen with PEFT methods), the reserved and allocated CUDA memory has increased. This might be due to CUDA memory fragmentations. PyTorch recenly added an enviroment variable that helps to better manage memory fragmentation (this feature in available on PyTorch nightlies at the time of writing this doc July 30 2023). You can set this in your main training script as follows:
-
-    ```bash
-
-    os.environ['PYTORCH_CUDA_ALLOC_CONF']='expandable_segments:True'
-
-    ```
-    We also added this enviroment variable in `setup_environ_flags` of the [train_utils.py](../src/llama_recipes/utils/train_utils.py), feel free to uncomment it if required.
-
-8. Additional debugging flags?
-
-    The environment variable `TORCH_DISTRIBUTED_DEBUG` can be used to trigger additional useful logging and collective synchronization checks to ensure all ranks are synchronized appropriately. `TORCH_DISTRIBUTED_DEBUG` can be set to either OFF (default), INFO, or DETAIL depending on the debugging level required. Please note that the most verbose option, DETAIL may impact the application performance and thus should only be used when debugging issues.
-
-    We also added this enviroment variable in `setup_environ_flags` of the [train_utils.py](../src/llama_recipes/utils/train_utils.py), feel free to uncomment it if required.
-
-9. I am getting import errors when running inference.
-
-    Verify that CUDA environment variables are set correctly on your machine. For example for bitsandbytes, you can generally set it as below to get things working on A100 80g's on AWS.
-
-    ```bash
-    export CUDA_HOME="/usr/local/cuda-11.8"
-    export PATH=$CUDA_HOME/bin:$PATH
-    export LD_LIBRARY_PATH=$CUDA_HOME/lib:$CUDA_HOME/lib64:$CUDA_HOME/efa/lib:/opt/amazon/efa/lib:$LD_LIBRARY_PATH
-    ```

+ 23 - 0
recipes/use_cases/end2end-recipes/raft/eval_config.yaml

@@ -0,0 +1,23 @@
+eval_prompt_template: >
+  You are an AI assistant skilled in answering questions related to Llama language models,
+  which include Llama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1, Meta Llama Guard 2.
+  Below is a question from a Llama user. Think step by step and then answer it in {language}; make the answer as concise as possible, at most 100 words.
+  Return the result with the template:
+  [
+    {{
+      "Question": "The question the user asked you",
+      "Answer": "Your answer to the question"
+    }}
+  ]
+judge_prompt_template: >
+  You are provided with a question, a teacher's answer and a student's answer. Given that question, you need to score how good the student's answer is compared to
+  the teacher's answer. If the student's answer is correct based on the teacher's answer, then return YES. If the answer is not faithful, then return NO
+  and explain which part of the student's answer is not faithful in the Reason section.
+  Return the result in json format with the template:
+    {{
+      "Reason": "your reason here.",
+      "Result": "YES or NO."
+    }}
+eval_json: "./evalset.json"
+
+language: "English"
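As a rough sketch, this is how the templates above are consumed in `eval_raft.py` (`load_config` comes from the recipe's `config` module):

```python
# Sketch: load the eval config and build the system prompt, mirroring eval_raft.py.
from config import load_config

context = load_config("eval_config.yaml")
system_prompt = context["eval_prompt_template"].format(language=context["language"])
# The doubled braces {{ }} in the YAML survive str.format() as literal braces in the prompt.
```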

+ 219 - 0
recipes/use_cases/end2end-recipes/raft/eval_raft.py

@@ -0,0 +1,219 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
+from chat_utils import OctoAIChatService, VllmChatService
+import logging
+import evaluate
+import argparse
+from config import load_config
+import asyncio
+import json
+from itertools import chain
+from generator_utils import parse_qa_to_json, generate_LLM_eval
+from langchain_community.llms import VLLM
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import DirectoryLoader
+from langchain.chains import RetrievalQA
+
+from eval_utils import exact_match_score
+def generate_answers_model_only(model_path, question_list):
+    # Load the RAFT model directly (no retrieval) and answer each question
+    llm = VLLM(model=model_path,
+           trust_remote_code=True,  # mandatory for hf models
+           max_new_tokens=500,
+           top_p=1,
+           temperature=0.0,
+           # tensor_parallel_size=... # for distributed inference
+        )
+    generated_answers = []
+    for question in question_list:
+        # llm.invoke returns the generated text directly
+        result = llm.invoke(question)
+        generated_answers.append(result)
+    return generated_answers
+def generate_answers_with_RAG(model_path, data_dir,question_list):
+    # Use langchain to load the documents from data directory
+    loader = DirectoryLoader(data_dir)
+    docs = loader.load()
+    # Split the document into chunks with a specified chunk size
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    all_splits = text_splitter.split_documents(docs)
+
+    # Store the document into a vector store with a specific embedding model
+    vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))
+    # Load the RAFT model
+    llm = VLLM(model=model_path,
+           trust_remote_code=True,  # mandatory for hf models
+           max_new_tokens=500,
+           top_p=1,
+           temperature=0.0,
+           # tensor_parallel_size=... # for distributed inference
+        )
+    # Create a RetrievalQA chain with the vector store and RAFT model
+    qa_chain = RetrievalQA.from_chain_type(
+    llm,
+    retriever=vectorstore.as_retriever()
+    )
+    generated_answers = []
+    for question in question_list:
+        result = qa_chain({"query": question})
+        # RetrievalQA returns its answer under the "result" key
+        generated_answers.append(result["result"])
+    return generated_answers
+def compute_rouge_score(generated : str, reference: str):
+    rouge_score = evaluate.load('rouge')
+    return rouge_score.compute(
+        predictions=generated,
+        references=reference,
+        use_stemmer=True,
+        use_aggregator=True
+    )
+def compute_bert_score(generated : str, reference: str):
+    bertscore = evaluate.load("bertscore")
+    score = bertscore.compute(
+        predictions=generated,
+        references=reference,
+        lang="en"
+    )
+    f1 = score["f1"]
+    precision = score["precision"]
+    recall = score["recall"]
+    return sum(precision)/len(precision), sum(recall)/len(recall), sum(f1)/len(f1)
+# This function is used to eval the fine-tuned model: given a question, it generates an answer.
+async def eval_request(chat_service, api_context: dict, question: str) -> dict:
+    prompt_for_system = api_context['eval_prompt_template'].format(language=api_context["language"])
+    chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': f"Question: {question}"}]
+    # Getting a list of result, in this case, there should be only one result
+    response_string = await chat_service.execute_chat_request_async(api_context, chat_request_payload)
+    # convert the result string to a dict that contains Question, Answer
+    result_list = parse_qa_to_json(response_string)
+    if not result_list or len(result_list) > 1:
+        print("Error: eval response should be a list of one result dict")
+        return {}
+    result = result_list[0]
+    if "Answer" not in result:
+        print("Error: eval response does not contain answer")
+        return {}
+    # Send back the model generated answer
+
+    return result["Answer"]
+
+async def generate_eval_answer(chat_service, api_context: dict, questions: list):
+    eval_tasks = []
+    for batch_index, question in enumerate(questions):
+        try:
+            result = eval_request(chat_service, api_context, question)
+            eval_tasks.append(result)
+        except Exception as e:
+            print(f"Error during data eval request execution: {e}")
+    print(len(eval_tasks),"eval_tasks")
+    eval_results = await asyncio.gather(*eval_tasks)
+
+    return eval_results
+
+async def main(context):
+    if context["endpoint"]:
+        chat_service = VllmChatService()
+    else:
+        chat_service = OctoAIChatService()
+    try:
+        logging.info("Starting to generate answer given the eval set.")
+        with open(context["eval_json"]) as fp:
+            eval_json = json.load(fp)
+        questions, ground_truth = [], []
+        for index, item in enumerate(eval_json):
+            questions.append(item["question"])
+            ground_truth.append(item["answer"])
+        # Assumes the eval config also provides a "data_dir" pointing at the source documents
+        generated_answers = generate_answers_with_RAG(context["model"], context["data_dir"], questions)
+        if not generated_answers:
+            logging.warning("No answers generated. Please check the input context or model configuration.")
+            return
+        logging.info(f"Successfully generated {len(generated_answers)} answers.")
+        judge_list = []
+        for index, item in enumerate(generated_answers):
+            judge_list.append({"Question": questions[index], "Ground_truth": ground_truth[index], "Generated_answer": generated_answers[index]})
+        if context["judge_endpoint"]:
+            # make a copy of the context then change the VLLM endpoint to judge_endpoint
+            context_copy = dict(context)
+            context_copy["endpoint"] = context["judge_endpoint"]
+            context_copy["model"] = "meta-llama/Meta-Llama-3-70B-Instruct"
+            judge_results = await generate_LLM_eval(chat_service, context_copy, judge_list)
+            correct_num = 0
+            for result in judge_results:
+                correct_num += result["Result"] == "YES"
+            LLM_judge_score = correct_num/len(judge_results)
+            print(f"The accuracy of the model is {LLM_judge_score}")
+        rouge_score = compute_rouge_score(generated_answers, ground_truth)
+        print("Rouge_score:", rouge_score)
+        P, R, F1 = compute_bert_score(generated_answers, ground_truth)
+        print(f"BERTScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f}")
+        exact_match = 0
+        for item in judge_list:
+            exact_match += exact_match_score(item['Generated_answer'],item['Ground_truth'])
+        exact_match_percentage = exact_match/len(judge_list)
+        print(f"Exact_match_percentage: {exact_match_percentage:.4f}")
+        # Saving the eval result to a log file
+        with open(context["output_log"],"a") as fp:
+            fp.write(f"Eval_result for {context['model']} \n")
+            fp.write(f"Rouge_score: {rouge_score} \n")
+            fp.write(f"BERTScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f} \n")
+            fp.write(f"Exact_match_percentage: {exact_match_percentage} \n")
+            if context["judge_endpoint"]:
+                fp.write(f"LLM_judge_score: {LLM_judge_score} \n")
+            fp.write(f"QA details: \n")
+            for item in judge_list:
+                fp.write(f"question: {item['Question']} \n")
+                fp.write(f"generated_answers: {item['Generated_answer']} \n")
+                fp.write(f"groud_truth: {item['Ground_truth']} \n")
+                fp.write("\n")
+        logging.info(f"Eval successfully, the eval result is saved to {context['output_log']}.")
+    except Exception as e:
+        logging.error(f"An unexpected error occurred during the process: {e}",exc_info=True)
+
+def parse_arguments():
+    # Define command line arguments for the script
+    parser = argparse.ArgumentParser(
+        description="Evaluate a RAFT fine-tuned model against an eval set."
+    )
+    parser.add_argument(
+        "-m", "--model",
+        default="chatbot",
+        help="Select the model to use for evaluation, this maybe a LoRA adapter."
+    )
+    parser.add_argument(
+        "-c", "--config_path",
+        default="eval_config.yaml",
+        help="Set the configuration file path that has system prompt along with language, evalset path."
+    )
+    parser.add_argument(
+        "-v", "--vllm_endpoint",
+        default=None,
+        type=int,
+        help="If a port is specified, then use local vllm endpoint for evaluations."
+    )
+    parser.add_argument(
+        "-j", "--judge_endpoint",
+        default=None,
+        type=int,
+        help="If a port is specified, then use local vllm endpoint as judge LLM."
+    )
+    parser.add_argument(
+        "-o", "--output_log",
+        default="eval_result.log",
+        help="save the eval result to a log file. Default is eval_result.log"
+    )
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    logging.info("Initializing the process and loading configuration...")
+    args = parse_arguments()
+    context = load_config(args.config_path)
+    context["model"] = args.model
+    context["endpoint"] = args.vllm_endpoint
+    context["judge_endpoint"] = args.judge_endpoint
+    context["output_log"] = args.output_log
+    if context["endpoint"]:
+        logging.info(f"Use local vllm service for eval at port: '{args.vllm_endpoint}'.")
+    if context["judge_endpoint"]:
+        logging.info(f"Use local vllm service for judge at port: '{args.judge_endpoint}'.")
+    asyncio.run(main(context))
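As a quick sanity check, the metric helpers above can be exercised on their own (the strings are illustrative; `evaluate` downloads the rouge and bertscore metrics on first use):

```python
# Illustrative check of the metric helpers defined in eval_raft.py.
predictions = ["Llama 3 was pretrained on over 15T tokens."]
references = ["Meta Llama 3 models were pretrained on over 15 trillion tokens of data."]

print(compute_rouge_score(predictions, references))   # dict of ROUGE-1/2/L/Lsum scores
print(compute_bert_score(predictions, references))    # (precision, recall, f1) averages
```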

+ 122 - 0
recipes/use_cases/end2end-recipes/raft/eval_utils.py

@@ -0,0 +1,122 @@
+import re
+import string
+from collections import Counter
+
+def normalize_answer(s):
+
+    def remove_articles(text):
+        return re.sub(r'\b(a|an|the)\b', ' ', text)
+
+    def white_space_fix(text):
+        return ' '.join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return ''.join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def f1_score(prediction, ground_truth):
+    normalized_prediction = normalize_answer(prediction)
+    normalized_ground_truth = normalize_answer(ground_truth)
+
+    ZERO_METRIC = (0, 0, 0)
+
+    if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
+        return ZERO_METRIC
+    if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth:
+        return ZERO_METRIC
+
+    prediction_tokens = normalized_prediction.split()
+    ground_truth_tokens = normalized_ground_truth.split()
+    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
+    num_same = sum(common.values())
+    if num_same == 0:
+        return ZERO_METRIC
+    precision = 1.0 * num_same / len(prediction_tokens)
+    recall = 1.0 * num_same / len(ground_truth_tokens)
+    f1 = (2 * precision * recall) / (precision + recall)
+    return f1, precision, recall
+
+
+def exact_match_score(prediction, ground_truth):
+    return (normalize_answer(prediction) == normalize_answer(ground_truth))
+
+def update_answer(metrics, prediction, gold):
+    em = exact_match_score(prediction, gold)
+    f1, prec, recall = f1_score(prediction, gold)
+    metrics['em'] += float(em)
+    metrics['f1'] += f1
+    metrics['prec'] += prec
+    metrics['recall'] += recall
+    return em, prec, recall
+
+def update_sp(metrics, prediction, gold):
+    cur_sp_pred = set(map(tuple, prediction))
+    gold_sp_pred = set(map(tuple, gold))
+    tp, fp, fn = 0, 0, 0
+    for e in cur_sp_pred:
+        if e in gold_sp_pred:
+            tp += 1
+        else:
+            fp += 1
+    for e in gold_sp_pred:
+        if e not in cur_sp_pred:
+            fn += 1
+    prec = 1.0 * tp / (tp + fp) if tp + fp > 0 else 0.0
+    recall = 1.0 * tp / (tp + fn) if tp + fn > 0 else 0.0
+    f1 = 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0
+    em = 1.0 if fp + fn == 0 else 0.0
+    metrics['sp_em'] += em
+    metrics['sp_f1'] += f1
+    metrics['sp_prec'] += prec
+    metrics['sp_recall'] += recall
+    return em, prec, recall
+
+def eval(prediction, gold):
+
+    metrics = {'em': 0, 'f1': 0, 'prec': 0, 'recall': 0,
+        'sp_em': 0, 'sp_f1': 0, 'sp_prec': 0, 'sp_recall': 0,
+        'joint_em': 0, 'joint_f1': 0, 'joint_prec': 0, 'joint_recall': 0}
+    for dp in gold:
+        cur_id = dp['_id']
+        can_eval_joint = True
+        if cur_id not in prediction['answer']:
+            print('missing answer {}'.format(cur_id))
+            can_eval_joint = False
+        else:
+            em, prec, recall = update_answer(
+                metrics, prediction['answer'][cur_id], dp['answer'])
+        if cur_id not in prediction['sp']:
+            print('missing sp fact {}'.format(cur_id))
+            can_eval_joint = False
+        else:
+            sp_em, sp_prec, sp_recall = update_sp(
+                metrics, prediction['sp'][cur_id], dp['supporting_facts'])
+
+        if can_eval_joint:
+            joint_prec = prec * sp_prec
+            joint_recall = recall * sp_recall
+            if joint_prec + joint_recall > 0:
+                joint_f1 = 2 * joint_prec * joint_recall / (joint_prec + joint_recall)
+            else:
+                joint_f1 = 0.
+            joint_em = em * sp_em
+
+            metrics['joint_em'] += joint_em
+            metrics['joint_f1'] += joint_f1
+            metrics['joint_prec'] += joint_prec
+            metrics['joint_recall'] += joint_recall
+
+    N = len(gold)
+    for k in metrics.keys():
+        metrics[k] /= N
+
+    return metrics
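For illustration, the normalization-based metrics behave like this on a toy prediction (the numbers are easy to verify by hand):

```python
# Toy example: normalization lowercases, strips punctuation and articles before comparing.
from eval_utils import exact_match_score, f1_score

prediction, gold = "The answer is 15T tokens.", "15T tokens"
print(exact_match_score(prediction, gold))   # False: extra words remain after normalization
f1, precision, recall = f1_score(prediction, gold)
print(f"F1={f1:.2f} precision={precision:.2f} recall={recall:.2f}")  # F1=0.67 precision=0.50 recall=1.00
```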

+ 1 - 1
recipes/use_cases/end2end-recipes/raft/raft.py

@@ -43,7 +43,7 @@ async def main(context):
 
         # Extract format specific params
         format_params = {}
-        formatter.convert(ds=ds, format=args.output_format, output_path=args.output, output_type=args.output_type, params=format_params)
+        formatter.convert(ds=ds, format=args.output_format, output_path=args.output+"raft", output_type=args.output_type, params=format_params)
     except Exception as e:
         logging.error(f"An unexpected error occurred during the process: {e}",exc_info=True)
 

+ 19 - 7
recipes/use_cases/end2end-recipes/raft/raft.yaml

@@ -6,14 +6,26 @@ COT_prompt_template: >
         - End your response with final answer in the form <ANSWER>: $answer, the answer should be succinct.
         You MUST begin your final answer with the tag "<ANSWER>:
 
+# question_prompt_template: >
+#   You are a synthetic question-answer pair generator. Given a chunk of context about
+#   some topic(s), generate {num_questions} example questions a user could ask and would be answered
+#   \using information from the chunk. For example, if the given context was a Wikipedia
+#   paragraph about the United States, an example question could be 'How many states are
+#   in the United States?
+#   The questions should be able to be answered in a few words or less. Include only the
+#   questions in your response.
 question_prompt_template: >
-  You are a synthetic question-answer pair generator. Given a chunk of context about
-  some topic(s), generate {num_questions} example questions a user could ask and would be answered
-  \using information from the chunk. For example, if the given context was a Wikipedia
-  paragraph about the United States, an example question could be 'How many states are
-  in the United States?
-  The questions should be able to be answered in a few words or less. Include only the
-  questions in your response.
+  You are a language model skilled in creating quiz questions.
+  You will be provided with a document.
+  Read it and generate question and answer pairs that are most likely to be asked by a user of Llama language models,
+  which include Llama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1, Meta Llama Guard 2.
+  Output only the questions related to Llama.
+  Please make sure you follow these rules:
+  1. Generate {num_questions} question-answer pairs; you can generate fewer if there is nothing related to the model, training, fine-tuning or evaluation details of Llama language models.
+  2. The questions can be answered based *solely* on the given passage.
+  3. Avoid asking questions with similar meaning.
+  4. Never use any abbreviation.
+  5. Include only the questions in your response.
 
 data_dir: "./data"
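A small sketch of how the new `question_prompt_template` is presumably filled in before being sent to the question generator (the value of `num_questions` is illustrative):

```python
# Sketch: read raft.yaml and substitute the number of questions into the new template.
import yaml

with open("raft.yaml") as f:
    cfg = yaml.safe_load(f)

system_prompt = cfg["question_prompt_template"].format(num_questions=5)
print(system_prompt)
```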
 

+ 3 - 27
recipes/use_cases/end2end-recipes/raft/raft_utils.py

@@ -96,7 +96,7 @@ def read_file_content(context):
                 file_strings.append(file_text)
     text = '\n'.join(file_strings)
     text = remove_non_printable(text)
-    return remove_non_printable(text)
+    return text
 
 def remove_non_printable(s):
     printable = set(string.printable)
@@ -199,6 +199,7 @@ async def add_chunk_to_dataset(
             COT_tasks.append(generate_COT(chat_service, context, chunk, question))
     COT_results = await asyncio.gather(*COT_tasks)
     for chunk, q , cot in COT_results:
+        # The COT answer will be used in the fine-tuning stage
         datapt = {
             "id": None,
             "type": "general",
@@ -237,6 +238,7 @@ async def add_chunk_to_dataset(
         for doc in docs:
             context += "<DOCUMENT>" + str(doc) + "</DOCUMENT>\n"
         context += q
+        # This instruction will be used in the fine-tuning stage
         datapt["instruction"] = context
 
         # add to dataset
@@ -253,29 +255,3 @@ async def add_chunk_to_dataset(
         else:
             ds = ds.add_item(datapt)
     return ds
-# This function is used to evaluate the quality of generated QA pairs. Return the original QA pair if the model eval result is YES. Otherwise, return an empty dict.
-async def LLM_judge_request(chat_service, api_context: dict, document_content: dict) -> dict:
-    prompt_for_system = api_context['judge_prompt_template'].format(language=api_context["language"])
-    chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': f"Question: {document_content['Question']} \n Teacher's Answer: {document_content['Ground_truth']}\n Student's Answer: {document_content['Generated_answer']} "}]
-    result = await chat_service.execute_chat_request_async(api_context, chat_request_payload)
-    if not result:
-        return {}
-    # no parsing needed, just return the loads the result as a dict
-    result = json.loads(result)
-    if "Result" not in result:
-        print("Error: eval response does not contain answer")
-        print(document_content,result)
-        return {}
-    return result
-
-async def generate_LLM_eval(chat_service, api_context: dict, judge_list: list):
-    eval_tasks = []
-    for batch_index, batch_content in enumerate(judge_list):
-        try:
-            result = LLM_judge_request(chat_service, api_context, batch_content)
-            eval_tasks.append(result)
-        except Exception as e:
-            print(f"Error during data eval request execution: {e}")
-
-    judge_results = await asyncio.gather(*eval_tasks)
-    return judge_results