@@ -0,0 +1,219 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
+
+import argparse
+import asyncio
+import json
+import logging
+from itertools import chain
+
+import evaluate
+from langchain.chains import RetrievalQA
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import DirectoryLoader
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.llms import VLLM
+from langchain_community.vectorstores import FAISS
+
+from chat_utils import OctoAIChatService, VllmChatService
+from config import load_config
+from eval_utils import exact_match_score
+from generator_utils import parse_qa_to_json, generate_LLM_eval
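+
+# Note: chat_utils, config, eval_utils and generator_utils are local helper
+# modules that live alongside this script in the recipe; they are not
+# installable packages.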
+
+def generate_answers_model_only(model_path, question_list):
+    # Load the RAFT model with vLLM; no retrieval context is used here
+    llm = VLLM(model=model_path,
+           trust_remote_code=True,  # mandatory for hf models
+           max_new_tokens=500,
+           top_p=1,
+           temperature=0.0,
+           # tensor_parallel_size=... # for distributed inference
+        )
+    generated_answers = []
+    for question in question_list:
+        # llm.invoke returns the generated text as a plain string
+        result = llm.invoke(question)
+        generated_answers.append(result)
+    return generated_answers
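+
+# generate_answers_model_only() gives a no-retrieval baseline; the RAG variant
+# below answers the same questions with retrieved document chunks so the two
+# settings can be compared.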
+
+def generate_answers_with_RAG(model_path, data_dir, question_list):
+    # Use langchain to load the documents from the data directory
+    loader = DirectoryLoader(data_dir)
+    docs = loader.load()
+    # Split the documents into chunks with a specified chunk size
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    all_splits = text_splitter.split_documents(docs)
+
+    # Store the chunks in a vector store with a specific embedding model
+    vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))
+    # Load the RAFT model
+    llm = VLLM(model=model_path,
+           trust_remote_code=True,  # mandatory for hf models
+           max_new_tokens=500,
+           top_p=1,
+           temperature=0.0,
+           # tensor_parallel_size=... # for distributed inference
+        )
+    # Create a RetrievalQA chain with the vector store and RAFT model
+    qa_chain = RetrievalQA.from_chain_type(
+        llm,
+        retriever=vectorstore.as_retriever()
+    )
+    generated_answers = []
+    for question in question_list:
+        result = qa_chain({"query": question})
+        # RetrievalQA returns the generated answer under the "result" key
+        generated_answers.append(result["result"])
+    return generated_answers
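+
+# Example usage (hypothetical model path and data directory):
+#   answers = generate_answers_with_RAG("raft-8b", "./data", ["What is RAFT?"])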
+
+def compute_rouge_score(generated: list, reference: list):
+    rouge_score = evaluate.load('rouge')
+    return rouge_score.compute(
+        predictions=generated,
+        references=reference,
+        use_stemmer=True,
+        use_aggregator=True
+    )
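+
+# With use_aggregator=True the rouge metric returns a dict of aggregated
+# floats, e.g. {"rouge1": ..., "rouge2": ..., "rougeL": ..., "rougeLsum": ...}.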
+
+def compute_bert_score(generated: list, reference: list):
+    bertscore = evaluate.load("bertscore")
+    score = bertscore.compute(
+        predictions=generated,
+        references=reference,
+        lang="en"
+    )
+    f1 = score["f1"]
+    precision = score["precision"]
+    recall = score["recall"]
+    return sum(precision)/len(precision), sum(recall)/len(recall), sum(f1)/len(f1)
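+
+# bertscore returns one precision/recall/f1 value per prediction, so the
+# averages above give corpus-level scores.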
+
+# Evaluate the fine-tuned model: given a question, ask the served model to generate an answer.
+async def eval_request(chat_service, api_context: dict, question: str):
+    prompt_for_system = api_context['eval_prompt_template'].format(language=api_context["language"])
+    chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': f"Question: {question}"}]
+    # The service returns a list of results; here there should be exactly one
+    response_string = await chat_service.execute_chat_request_async(api_context, chat_request_payload)
+    # Convert the response string into a dict that contains Question and Answer
+    result_list = parse_qa_to_json(response_string)
+    if not result_list or len(result_list) > 1:
+        print("Error: eval response should be a list with exactly one result dict")
+        return {}
+    result = result_list[0]
+    if "Answer" not in result:
+        print("Error: eval response does not contain an answer")
+        return {}
+    # Send back the model-generated answer
+    return result["Answer"]
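+
+# parse_qa_to_json (from generator_utils) is assumed to return a list of dicts
+# shaped like [{"Question": "...", "Answer": "..."}], which is what the checks
+# above rely on.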
+
+async def generate_eval_answer(chat_service, api_context: dict, questions: list):
+    eval_tasks = []
+    for question in questions:
+        try:
+            result = eval_request(chat_service, api_context, question)
+            eval_tasks.append(result)
+        except Exception as e:
+            print(f"Error while creating the eval request: {e}")
+    print(f"Scheduled {len(eval_tasks)} eval requests")
+    eval_results = await asyncio.gather(*eval_tasks)
+    return eval_results
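+
+# eval_request() is called but not awaited inside the loop above, so eval_tasks
+# holds coroutines that asyncio.gather() then runs concurrently.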
+
+async def main(context):
+    # Use the local vllm endpoint when one is configured, otherwise fall back to OctoAI
+    if context["endpoint"]:
+        chat_service = VllmChatService()
+    else:
+        chat_service = OctoAIChatService()
+    try:
+        logging.info("Starting to generate answers for the eval set.")
+        with open(context["eval_json"]) as fp:
+            eval_json = json.load(fp)
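+        # The eval set is expected to be a JSON list of items shaped like
+        # {"question": "...", "answer": "..."}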
+        questions, ground_truth = [], []
+        for item in eval_json:
+            questions.append(item["question"])
+            ground_truth.append(item["answer"])
+        # The document folder used for retrieval is assumed to be provided as
+        # "data_dir" in the eval config.
+        generated_answers = generate_answers_with_RAG(context["model"], context["data_dir"], questions)
+        if not generated_answers:
+            logging.warning("No answers generated. Please check the input context or model configuration.")
+            return
+        logging.info(f"Successfully generated {len(generated_answers)} answers.")
+        judge_list = []
+        for question, gt, answer in zip(questions, ground_truth, generated_answers):
+            judge_list.append({"Question": question, "Ground_truth": gt, "Generated_answer": answer})
+        if context["judge_endpoint"]:
+            # Make a copy of the context, then point the vllm endpoint at the judge model
+            context_copy = dict(context)
+            context_copy["endpoint"] = context["judge_endpoint"]
+            context_copy["model"] = "meta-llama/Meta-Llama-3-70B-Instruct"
+            judge_results = await generate_LLM_eval(chat_service, context_copy, judge_list)
+            # The judge is expected to answer "YES" when the generated answer matches the ground truth
+            correct_num = 0
+            for result in judge_results:
+                correct_num += result["Result"] == "YES"
+            LLM_judge_score = correct_num / len(judge_results)
+            print(f"The LLM-judge accuracy of the model is {LLM_judge_score:.4f}")
+        rouge_score = compute_rouge_score(generated_answers, ground_truth)
+        print("Rouge_score:", rouge_score)
+        P, R, F1 = compute_bert_score(generated_answers, ground_truth)
+        print(f"BERTScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f}")
+        exact_match = 0
+        for item in judge_list:
+            exact_match += exact_match_score(item['Generated_answer'], item['Ground_truth'])
+        exact_match_percentage = exact_match / len(judge_list)
+        print(f"Exact_match_percentage: {exact_match_percentage:.4f}")
+        # Save the eval result to a log file
+        with open(context["output_log"], "a") as fp:
+            fp.write(f"Eval_result for {context['model']} \n")
+            fp.write(f"Rouge_score: {rouge_score} \n")
+            fp.write(f"BERTScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f} \n")
+            fp.write(f"Exact_match_percentage: {exact_match_percentage} \n")
+            if context["judge_endpoint"]:
+                fp.write(f"LLM_judge_score: {LLM_judge_score} \n")
+            fp.write("QA details: \n")
+            for item in judge_list:
+                fp.write(f"question: {item['Question']} \n")
+                fp.write(f"generated_answers: {item['Generated_answer']} \n")
+                fp.write(f"ground_truth: {item['Ground_truth']} \n")
+                fp.write("\n")
+        logging.info(f"Evaluation finished successfully; the result is saved to {context['output_log']}.")
+    except Exception as e:
+        logging.error(f"An unexpected error occurred during the process: {e}", exc_info=True)
+
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+def parse_arguments(): 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+    # Define command line arguments for the script 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+    parser = argparse.ArgumentParser( 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+        description="Generate question/answer pairs from documentation." 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+    ) 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+    parser.add_argument( 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+        "-m", "--model", 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+        default="chatbot", 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+        help="Select the model to use for evaluation, this maybe a LoRA adapter." 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+    ) 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+    parser.add_argument( 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+        "-c", "--config_path", 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+        default="eval_config.yaml", 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+        help="Set the configuration file path that has system prompt along with language, evalset path." 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
																+    ) 
															 | 
														
													
												
													
														
															| 
															 | 
															
																 
															 | 
															
															 | 
															
+    parser.add_argument(
+        "-v", "--vllm_endpoint",
+        default=None,
+        type=int,
+        help="If a port is specified, use the local vLLM endpoint for evaluations."
+    )
+    parser.add_argument(
+        "-j", "--judge_endpoint",
+        default=None,
+        type=int,
+        help="If a port is specified, use the local vLLM endpoint as the judge LLM."
+    )
+    parser.add_argument(
+        "-o", "--output_log",
+        default="eval_result.log",
+        help="Save the eval result to a log file. Default is eval_result.log."
+    )
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    logging.info("Initializing the process and loading configuration...")
+    args = parse_arguments()
+    context = load_config(args.config_path)
+    context["model"] = args.model
+    context["endpoint"] = args.vllm_endpoint
+    context["judge_endpoint"] = args.judge_endpoint
+    context["output_log"] = args.output_log
+    if context["endpoint"]:
+        logging.info(f"Using the local vLLM service for evaluation at port '{args.vllm_endpoint}'.")
+    if context["judge_endpoint"]:
+        logging.info(f"Using the local vLLM service as the judge at port '{args.judge_endpoint}'.")
+    asyncio.run(main(context))
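For readers skimming the diff, here is a minimal, self-contained sketch of how the CLI flags added above feed into the `context` dict. It only re-creates the flags visible in this hunk; `build_parser` and the `-m/--model` flag with its default are hypothetical stand-ins for the model argument defined earlier in the file, and `load_config` is replaced with an empty dict, so treat this as an illustration of the override flow rather than the actual script.

```python
import argparse

def build_parser():
    # Hypothetical helper mirroring parse_arguments() from the diff.
    parser = argparse.ArgumentParser()
    # "-m/--model" is a stand-in for the model flag defined earlier in the file;
    # only the flags below appear in this hunk.
    parser.add_argument("-m", "--model", default="chatbot")
    parser.add_argument("-c", "--config_path", default="eval_config.yaml")
    parser.add_argument("-v", "--vllm_endpoint", default=None, type=int)
    parser.add_argument("-j", "--judge_endpoint", default=None, type=int)
    parser.add_argument("-o", "--output_log", default="eval_result.log")
    return parser

if __name__ == "__main__":
    # Simulate invoking the eval script with local vLLM ports for both
    # the evaluated model and the judge LLM.
    args = build_parser().parse_args(["-v", "8000", "-j", "8001"])

    # Stand-in for load_config(args.config_path), which would return the
    # YAML config (system prompt, language, evalset path) as a dict.
    context = {}

    # CLI values are merged into the config-derived context, as in the diff.
    context["model"] = args.model
    context["endpoint"] = args.vllm_endpoint          # 8000 -> local vLLM for eval
    context["judge_endpoint"] = args.judge_endpoint   # 8001 -> local vLLM as judge
    context["output_log"] = args.output_log
    print(context)
```

Leaving `-v`/`-j` unset keeps the corresponding endpoints at `None`, which, per the help text above, the script interprets as "no local vLLM endpoint" for that role.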