@@ -5,34 +5,37 @@ import logging
import evaluate
import argparse
from config import load_config
-import asyncio
import json
from itertools import chain
-from generator_utils import parse_qa_to_json, generate_LLM_eval
-from langchain_community.llms import VLLM
+from langchain_community.llms import VLLMOpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain.chains import RetrievalQA
+from langchain_core.messages import HumanMessage, SystemMessage
+import re
+import string
+from collections import Counter

-from eval_utils import exact_match_score
-def generate_answers_model_only(model_path):
+def generate_answers_model_only(model_name, question_list, api_url="http://localhost:8000/v1", key="EMPTY"):
    # Use langchain to load the documents from data directory
    # Load the RAFT model
-    llm = VLLM(model=model_path,
-        trust_remote_code=True, # mandatory for hf models
-        max_new_tokens=500,
-        top_p=1,
-        temperature=0.0,
-        # tensor_parallel_size=... # for distributed inference
-    )
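+    # VLLMOpenAI is LangChain's client for a vLLM server's OpenAI-compatible API, so the model
+    # is served at api_url rather than loaded in this process; llm.invoke() returns the
+    # completion text as a plain string, which is why it is appended directly below.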
+    llm = VLLMOpenAI(
+        openai_api_key=key,
+        openai_api_base=api_url,
+        model_name=model_name,
+        model_kwargs={"stop": ["."]},
+        temperature=0.0)
    generated_answers = []
    for question in question_list:
-        result = llm.invoke(question)
-        generated_answers.append(result["answer"])
+        response = llm.invoke(question)
+        generated_answers.append(response)
+    if len(generated_answers) == 0:
+        logging.error("No model answers generated. Please check the input context or model configuration for %s.", model_name)
+        return []
    return generated_answers
-def generate_answers_with_RAG(model_path, data_dir,question_list):
+def generate_answers_with_RAG(model_name, data_dir, question_list, api_url="http://localhost:8000/v1", key="EMPTY"):
    # Use langchain to load the documents from data directory
    loader = DirectoryLoader(data_dir)
    docs = loader.load()
@@ -43,13 +46,12 @@ def generate_answers_with_RAG(model_path, data_dir,question_list):
    # Store the document into a vector store with a specific embedding model
    vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))
    # Load the RAFT model
-    llm = VLLM(model=model_path,
-        trust_remote_code=True, # mandatory for hf models
-        max_new_tokens=500,
-        top_p=1,
-        temperature=0.0,
-        # tensor_parallel_size=... # for distributed inference
-    )
+    llm = VLLMOpenAI(
+        openai_api_key=key,
+        openai_api_base=api_url,
+        model_name=model_name,
+        model_kwargs={"stop": ["."]},
+        temperature=0.0)
    # Create a RetrievalQA chain with the vector store and RAFT model
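+    # RetrievalQA embeds each question, pulls the most similar chunks from the FAISS index,
+    # and passes them to the model as context when generating the answer.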
    qa_chain = RetrievalQA.from_chain_type(
        llm,
@@ -57,10 +59,13 @@ def generate_answers_with_RAG(model_path, data_dir,question_list):
    )
    generated_answers = []
    for question in question_list:
-        result = qa_chain({"query": question})
-        generated_answers.append(result["answer"])
+        response = qa_chain({"query": question})
+        generated_answers.append(response['result'])
+    if len(generated_answers) == 0:
+        logging.error("No RAG answers generated. Please check the input context or model configuration for %s.", model_name)
+        return []
    return generated_answers
-def compute_rouge_score(generated : str, reference: str):
+def compute_rouge_score(generated: list, reference: list):
    rouge_score = evaluate.load('rouge')
    return rouge_score.compute(
        predictions=generated,
@@ -68,7 +73,41 @@ def compute_rouge_score(generated : str, reference: str):
        use_stemmer=True,
        use_aggregator=True
    )
-def compute_bert_score(generated : str, reference: str):
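+# RAFT-style answers wrap quoted context in ##begin_quote##/##end_quote## markers;
+# strip them before scoring so the metrics compare only the answer text.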
+def remove_special_tokens(text_list):
+    clean_text_list = []
+    for text in text_list:
+        text = text.replace("##begin_quote##", "")
+        text = text.replace("##end_quote##", "")
+        text = text.strip()
+        clean_text_list.append(text)
+    return clean_text_list
+
+def normalize_answer(s):
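+    # Lower the text and remove punctuation, articles and extra whitespace (SQuAD-style answer normalization).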
+
+    def remove_articles(text):
+        return re.sub(r'\b(a|an|the)\b', ' ', text)
+
+    def white_space_fix(text):
+        return ' '.join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return ''.join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+def exact_match_score(prediction, ground_truth):
+    """Return the fraction of predictions that exactly match the ground truth after normalization."""
+    num_match = 0
+    assert len(prediction) == len(ground_truth), "Answer length does not match prediction length."
+    assert len(ground_truth) > 0, "Ground truth list must not be empty."
+    for pred, gold in zip(prediction, ground_truth):
+        if normalize_answer(pred) == normalize_answer(gold):
+            num_match += 1
+    return num_match / len(ground_truth)
+def compute_bert_score(generated: list, reference: list):
    bertscore = evaluate.load("bertscore")
    score = bertscore.compute(
        predictions=generated,
@@ -79,44 +118,65 @@ def compute_bert_score(generated : str, reference: str):
    precision = score["precision"]
    recall = score["recall"]
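+    # bertscore returns per-example precision/recall/f1 lists; report their averages over the eval set.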
    return sum(precision)/len(precision), sum(recall)/len(recall), sum(f1)/len(f1)
-# This function is used to eval the fine-tuned model, given the question, generate the answer.
-async def eval_request(chat_service, api_context: dict, question: str) -> dict:
-    prompt_for_system = api_context['eval_prompt_template'].format(language=api_context["language"])
-    chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': f"Question: {question}"}]
-    # Getting a list of result, in this case, there should be only one result
-    response_string = await chat_service.execute_chat_request_async(api_context, chat_request_payload)
-    # convert the result string to a dict that contains Question, Answer
-    result_list = parse_qa_to_json(response_string)
-    if not result_list or len(result_list) > 1:
-        print("Error: eval response should be a list of one result dict")
-        return {}
-    result = result_list[0]
-    if "Answer" not in result:
-        print("Error: eval response does not contain answer")
-        return {}
-    # Send back the model generated answer
-
-    return result["Answer"]
-
-async def generate_eval_answer(chat_service, api_context: dict, questions: list):
-    eval_tasks = []
-    for batch_index, question in enumerate(questions):
-        try:
-            result = eval_request(chat_service, api_context, question)
-            eval_tasks.append(result)
-        except Exception as e:
-            print(f"Error during data eval request execution: {e}")
-    print(len(eval_tasks),"eval_tasks")
-    eval_results = await asyncio.gather(*eval_tasks)
-
-    return eval_results
-
-async def main(context):
-    if context["endpoint"]:
-        chat_service = VllmChatService()
-    else:
-        chat_service = OctoAIChatService()
+def compute_judge_score(questions: list, generated: list, reference: list, context, api_url="http://localhost:8001/v1", key="EMPTY"):
+    correct_num = 0
+    model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
+    llm = VLLMOpenAI(
+        openai_api_key=key,
+        openai_api_base=api_url,
+        model_name=model_name,
+        model_kwargs={"stop": ["."]},
+        temperature=0.0)
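+    # Build one plain-text prompt per QA pair from the judge prompt template; the judge model is
+    # expected to reply with JSON that contains a "Result" field set to "YES" or "NO".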
+    for q, pred, gold in zip(questions, generated, reference):
+        # messages = [
+        #     SystemMessage(content=context['judge_prompt_template']),
+        #     HumanMessage(content=f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "),
+        # ]
+        messages = context['judge_prompt_template'] + "\n"
+        messages += f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "
+        response = llm.invoke(messages)
+        print(response + " -------------")
+        result = json.loads(response)
+        if "Result" not in result:
+            print("Error: judge response does not contain a Result field")
+            print(result)
+            continue
+        correct_num += result["Result"] == "YES"
+    return correct_num / len(questions)
+def score_single(context, generated, reference, questions, run_exact_match=True, run_rouge=True, run_bert=True, run_llm_as_judge=True):
+    # Default every metric to -1, meaning it has not been computed
+    metric = {
+        "Rouge_score": -1,
+        "BERTScore_Precision": -1,
+        "BERTScore_Recall": -1,
+        "BERTScore_F1": -1,
+        "LLM_judge_score": -1,
+        "Exact_match": -1
+    }
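+    # compute_rouge_score returns a dict of aggregated rouge1/rouge2/rougeL/rougeLsum values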
+    if run_rouge:
+        rouge_score = compute_rouge_score(generated, reference)
+        metric["Rouge_score"] = rouge_score
+        print("Rouge_score:", rouge_score)
+    if run_bert:
+        P, R, F1 = compute_bert_score(generated, reference)
+        print(f"BERTScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f}")
+        metric["BERTScore_Precision"] = P
+        metric["BERTScore_Recall"] = R
+        metric["BERTScore_F1"] = F1
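+    # LLM-as-judge is only run when a separate judge endpoint (port) has been configured.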
+    if context["judge_endpoint"] and run_llm_as_judge:
+        api_url = "http://localhost:" + str(context["judge_endpoint"]) + "/v1"
+        LLM_judge_score = compute_judge_score(questions, generated, reference, context, api_url=api_url)
+        metric["LLM_judge_score"] = LLM_judge_score
+        print(f"LLM_judge_score: {LLM_judge_score}")
+    if run_exact_match:
+        exact_match = exact_match_score(generated, reference)
+        print(f"Exact_match_percentage: {exact_match:.4f}")
+        metric["Exact_match"] = exact_match
+    return metric
+def main(context):
+    # Since the eval set is small, we can run the eval without async functions
    try:
+        api_url = "http://localhost:" + str(context["vllm_endpoint"]) + "/v1"
        logging.info("Starting to generate answer given the eval set.")
        with open(context["eval_json"]) as fp:
            eval_json = json.load(fp)
@@ -124,49 +184,47 @@ async def main(context):
        for index, item in enumerate(eval_json):
            questions.append(item["question"])
            groud_truth.append(item["answer"])
-        generated_answers = generate_answers_with_RAG(model_path, context,questions)
-        if not generated_answers:
-            logging.warning("No answers generated. Please check the input context or model configuration.")
-            return
-        logging.info(f"Successfully generated {len(generated_answers)} answers.")
-        judge_list = []
-        for index, item in enumerate(generated_answers):
-            judge_list.append({"Question":questions[index],"Ground_truth":groud_truth[index],"Generated_answer":generated_answers[index]})
-        if context["judge_endpoint"]:
-            # make a copy of the context then change the VLLM endpoint to judge_endpoint
-            context_copy = dict(context)
-            context_copy["endpoint"] = context["judge_endpoint"]
-            context_copy["model"] = "meta-llama/Meta-Llama-3-70B-Instruct"
-            judge_results = await generate_LLM_eval(chat_service, context_copy, judge_list)
-            correct_num = 0
-            for result in judge_results:
-                correct_num += result["Result"] == "YES"
-            LLM_judge_score = correct_num/len(judge_results)
-            print(f"The accuracy of the model is {LLM_judge_score}")
-        rouge_score = compute_rouge_score(generated_answers,groud_truth)
-        print("Rouge_score:",rouge_score)
-        P, R, F1 = compute_bert_score(generated_answers,groud_truth)
-        print(f"BERTScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f}")
-        exact_match = 0
-        for item in judge_list:
-            exact_match += exact_match_score(item['Generated_answer'],item['Ground_truth'])
-        exact_match_percentage = exact_match/len(judge_list)
-        print(f"Exact_match_percentage: {exact_match_percentage:.4f}")
-        # Saving the eval result to a log file
-        with open(context["output_log"],"a") as fp:
-            fp.write(f"Eval_result for {context['model']} \n")
-            fp.write(f"Rouge_score: {rouge_score} \n")
-            fp.write(f"BERTScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f} \n")
-            fp.write(f"Exact_match_percentage: {exact_match_percentage} \n")
-            if context["judge_endpoint"]:
-                fp.write(f"LLM_judge_score: {LLM_judge_score} \n")
-            fp.write(f"QA details: \n")
-            for item in judge_list:
-                fp.write(f"question: {item['Question']} \n")
-                fp.write(f"generated_answers: {item['Generated_answer']} \n")
-                fp.write(f"groud_truth: {item['Ground_truth']} \n")
-                fp.write("\n")
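+        # Collect one answer list per evaluated setting: the base model and the RAFT model,
+        # each queried directly and through the RAG chain.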
+        generated_answers = {
+            "RAFT": [],
+            "RAFT_RAG": [],
+            "Baseline": [],
+            "Baseline_RAG": [],
+        }
+        # Generate answers for the baseline model, with and without RAG
+        base_model_name = context["base_model_name"]
+        generated_answers["Baseline"] = generate_answers_model_only(base_model_name, questions, api_url)
+        generated_answers["Baseline_RAG"] = generate_answers_with_RAG(base_model_name, context["data_dir"], questions, api_url)
+        # Generate answers for the RAFT model, with and without RAG
+        raft_model_name = context["raft_model_name"]
+        generated_answers["RAFT"] = generate_answers_model_only(raft_model_name, questions, api_url)
+        generated_answers["RAFT_RAG"] = generate_answers_with_RAG(raft_model_name, context["data_dir"], questions, api_url)
+        # Clean special tokens from the RAFT generated answers
+        generated_answers["RAFT"] = remove_special_tokens(generated_answers["RAFT"])
+        generated_answers["RAFT_RAG"] = remove_special_tokens(generated_answers["RAFT_RAG"])
+        logging.info(f"Successfully generated {len(generated_answers['Baseline_RAG'])} answers for all models.")
+        # For the answers generated by each model, compute the score metrics
+        for model_name, model_answer in generated_answers.items():
+            if len(model_answer) != len(groud_truth):
+                print(f"The number of {model_name} answers does not match the number of ground-truth answers.")
+                continue
+            metric = score_single(context, model_answer, groud_truth, questions)
+            print(f"The eval result for {model_name} is: {metric}")
+            # Save the eval result for this model to the log file
+            with open(context["output_log"], "a") as fp:
+                fp.write(f"Eval_result for {model_name} \n")
+                fp.write(f"Rouge_score: {metric['Rouge_score']} \n")
+                fp.write(f"BERTScore Precision: {metric['BERTScore_Precision']:.4f}, Recall: {metric['BERTScore_Recall']:.4f}, F1: {metric['BERTScore_F1']:.4f} \n")
+                fp.write(f"Exact_match_percentage: {metric['Exact_match']} \n")
+                if context["judge_endpoint"]:
+                    fp.write(f"LLM_judge_score: {metric['LLM_judge_score']} \n")
+                fp.write("QA details: \n")
+                for item in zip(questions, model_answer, groud_truth):
+                    fp.write(f"question: {item[0]} \n")
+                    fp.write(f"generated_answers: {item[1]} \n")
+                    fp.write(f"groud_truth: {item[2]} \n")
+                    fp.write("\n")
+                fp.write("\n------------------------------------\n")
        logging.info(f"Eval successfully, the eval result is saved to {context['output_log']}.")
    except Exception as e:
        logging.error(f"An unexpected error occurred during the process: {e}",exc_info=True)
@@ -176,9 +234,9 @@ def parse_arguments():
        description="Generate question/answer pairs from documentation."
    )
    parser.add_argument(
-        "-m", "--model",
-        default="chatbot",
-        help="Select the model to use for evaluation, this maybe a LoRA adapter."
+        "-m", "--raft_model_name",
+        default=None,
+        help="Provide the raft_model_name to use for evaluation. If not specified, the raft_model_name in eval_config.yaml will be used."
    )
    parser.add_argument(
        "-c", "--config_path",
@@ -186,10 +244,15 @@ def parse_arguments():
        help="Set the configuration file path that has system prompt along with language, evalset path."
    )
    parser.add_argument(
-        "-v", "--vllm_endpoint",
+        "-d", "--data_dir",
        default=None,
+        help="Provide the data folder path to build RAG for evaluation. If not specified, the data_dir in eval_config.yaml will be used."
+    )
+    parser.add_argument(
+        "-v", "--vllm_endpoint",
+        default=8000,
        type=int,
-        help="If a port is specified, then use local vllm endpoint for evaluations."
+        help="The port of the local vllm endpoint used to serve the models for evaluation. Default is 8000."
    )
    parser.add_argument(
        "-j", "--judge_endpoint",
@@ -202,18 +265,20 @@ def parse_arguments():
        default="eval_result.log",
        help="save the eval result to a log file. Default is eval_result.log"
    )
+
    return parser.parse_args()
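+# Example invocation (script and model names are illustrative):
+#   python raft_eval.py -m <raft_model_name> -v 8000 -j 8001 -d ./data -c eval_config.yaml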

if __name__ == "__main__":
    logging.info("Initializing the process and loading configuration...")
    args = parse_arguments()
    context = load_config(args.config_path)
-    context["model"] = args.model
-    context["endpoint"] = args.vllm_endpoint
+    context["vllm_endpoint"] = args.vllm_endpoint
+    if args.data_dir:
+        context["data_dir"] = args.data_dir
+    if args.raft_model_name:
+        context["raft_model_name"] = args.raft_model_name
    context["judge_endpoint"] = args.judge_endpoint
    context["output_log"] = args.output_log
-    if context["endpoint"]:
-        logging.info(f"Use local vllm service for eval at port: '{args.vllm_endpoint}'.")
    if context["judge_endpoint"]:
        logging.info(f"Use local vllm service for judge at port: '{args.judge_endpoint}'.")
-    asyncio.run(main(context))
+    main(context)