eval_raft.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. # Copyright (c) Meta Platforms, Inc. and affiliates.
  2. # This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
  3. from chat_utils import OctoAIChatService, VllmChatService
  4. import logging
  5. import evaluate
  6. import argparse
  7. from config import load_config
  8. import asyncio
  9. import json
  10. from itertools import chain
  11. from generator_utils import parse_qa_to_json, generate_LLM_eval
  12. from langchain_community.llms import VLLM
  13. from langchain_community.embeddings import HuggingFaceEmbeddings
  14. from langchain_community.vectorstores import FAISS
  15. from langchain.text_splitter import RecursiveCharacterTextSplitter
  16. from langchain_community.document_loaders import DirectoryLoader
  17. from langchain.chains import RetrievalQA
  18. from eval_utils import exact_match_score
  19. def generate_answers_model_only(model_path):
  20. # Use langchain to load the documents from data directory
  21. # Load the RAFT model
  22. llm = VLLM(model=model_path,
  23. trust_remote_code=True, # mandatory for hf models
  24. max_new_tokens=500,
  25. top_p=1,
  26. temperature=0.0,
  27. # tensor_parallel_size=... # for distributed inference
  28. )
  29. generated_answers = []
  30. for question in question_list:
  31. result = llm.invoke(question)
  32. generated_answers.append(result["answer"])
  33. return generated_answers
  34. def generate_answers_with_RAG(model_path, data_dir,question_list):
  35. # Use langchain to load the documents from data directory
  36. loader = DirectoryLoader(data_dir)
  37. docs = loader.load()
  38. # Split the document into chunks with a specified chunk size
  39. text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
  40. all_splits = text_splitter.split_documents(docs)
  41. # Store the document into a vector store with a specific embedding model
  42. vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))
  43. # Load the RAFT model
  44. llm = VLLM(model=model_path,
  45. trust_remote_code=True, # mandatory for hf models
  46. max_new_tokens=500,
  47. top_p=1,
  48. temperature=0.0,
  49. # tensor_parallel_size=... # for distributed inference
  50. )
  51. # Create a RetrievalQA chain with the vector store and RAFT model
  52. qa_chain = RetrievalQA.from_chain_type(
  53. llm,
  54. retriever=vectorstore.as_retriever()
  55. )
  56. generated_answers = []
  57. for question in question_list:
  58. result = qa_chain({"query": question})
  59. generated_answers.append(result["answer"])
  60. return generated_answers
  61. def compute_rouge_score(generated : str, reference: str):
  62. rouge_score = evaluate.load('rouge')
  63. return rouge_score.compute(
  64. predictions=generated,
  65. references=reference,
  66. use_stemmer=True,
  67. use_aggregator=True
  68. )
  69. def compute_bert_score(generated : str, reference: str):
  70. bertscore = evaluate.load("bertscore")
  71. score = bertscore.compute(
  72. predictions=generated,
  73. references=reference,
  74. lang="en"
  75. )
  76. f1 = score["f1"]
  77. precision = score["precision"]
  78. recall = score["recall"]
  79. return sum(precision)/len(precision), sum(recall)/len(recall), sum(f1)/len(f1)
  80. # This function is used to eval the fine-tuned model, given the question, generate the answer.
  81. async def eval_request(chat_service, api_context: dict, question: str) -> dict:
  82. prompt_for_system = api_context['eval_prompt_template'].format(language=api_context["language"])
  83. chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': f"Question: {question}"}]
  84. # Getting a list of result, in this case, there should be only one result
  85. response_string = await chat_service.execute_chat_request_async(api_context, chat_request_payload)
  86. # convert the result string to a dict that contains Question, Answer
  87. result_list = parse_qa_to_json(response_string)
  88. if not result_list or len(result_list) > 1:
  89. print("Error: eval response should be a list of one result dict")
  90. return {}
  91. result = result_list[0]
  92. if "Answer" not in result:
  93. print("Error: eval response does not contain answer")
  94. return {}
  95. # Send back the model generated answer
  96. return result["Answer"]
  97. async def generate_eval_answer(chat_service, api_context: dict, questions: list):
  98. eval_tasks = []
  99. for batch_index, question in enumerate(questions):
  100. try:
  101. result = eval_request(chat_service, api_context, question)
  102. eval_tasks.append(result)
  103. except Exception as e:
  104. print(f"Error during data eval request execution: {e}")
  105. print(len(eval_tasks),"eval_tasks")
  106. eval_results = await asyncio.gather(*eval_tasks)
  107. return eval_results
  108. async def main(context):
  109. if context["endpoint"]:
  110. chat_service = VllmChatService()
  111. else:
  112. chat_service = OctoAIChatService()
  113. try:
  114. logging.info("Starting to generate answer given the eval set.")
  115. with open(context["eval_json"]) as fp:
  116. eval_json = json.load(fp)
  117. questions,groud_truth = [],[]
  118. for index, item in enumerate(eval_json):
  119. questions.append(item["question"])
  120. groud_truth.append(item["answer"])
  121. generated_answers = generate_answers_with_RAG(model_path, context,questions)
  122. if not generated_answers:
  123. logging.warning("No answers generated. Please check the input context or model configuration.")
  124. return
  125. logging.info(f"Successfully generated {len(generated_answers)} answers.")
  126. judge_list = []
  127. for index, item in enumerate(generated_answers):
  128. judge_list.append({"Question":questions[index],"Ground_truth":groud_truth[index],"Generated_answer":generated_answers[index]})
  129. if context["judge_endpoint"]:
  130. # make a copy of the context then change the VLLM endpoint to judge_endpoint
  131. context_copy = dict(context)
  132. context_copy["endpoint"] = context["judge_endpoint"]
  133. context_copy["model"] = "meta-llama/Meta-Llama-3-70B-Instruct"
  134. judge_results = await generate_LLM_eval(chat_service, context_copy, judge_list)
  135. correct_num = 0
  136. for result in judge_results:
  137. correct_num += result["Result"] == "YES"
  138. LLM_judge_score = correct_num/len(judge_results)
  139. print(f"The accuracy of the model is {LLM_judge_score}")
  140. rouge_score = compute_rouge_score(generated_answers,groud_truth)
  141. print("Rouge_score:",rouge_score)
  142. P, R, F1 = compute_bert_score(generated_answers,groud_truth)
  143. print(f"BERTScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f}")
  144. exact_match = 0
  145. for item in judge_list:
  146. exact_match += exact_match_score(item['Generated_answer'],item['Ground_truth'])
  147. exact_match_percentage = exact_match/len(judge_list)
  148. print(f"Exact_match_percentage: {exact_match_percentage:.4f}")
  149. # Saving the eval result to a log file
  150. with open(context["output_log"],"a") as fp:
  151. fp.write(f"Eval_result for {context['model']} \n")
  152. fp.write(f"Rouge_score: {rouge_score} \n")
  153. fp.write(f"BERTScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f} \n")
  154. fp.write(f"Exact_match_percentage: {exact_match_percentage} \n")
  155. if context["judge_endpoint"]:
  156. fp.write(f"LLM_judge_score: {LLM_judge_score} \n")
  157. fp.write(f"QA details: \n")
  158. for item in judge_list:
  159. fp.write(f"question: {item['Question']} \n")
  160. fp.write(f"generated_answers: {item['Generated_answer']} \n")
  161. fp.write(f"groud_truth: {item['Ground_truth']} \n")
  162. fp.write("\n")
  163. logging.info(f"Eval successfully, the eval result is saved to {context['output_log']}.")
  164. except Exception as e:
  165. logging.error(f"An unexpected error occurred during the process: {e}",exc_info=True)
  166. def parse_arguments():
  167. # Define command line arguments for the script
  168. parser = argparse.ArgumentParser(
  169. description="Generate question/answer pairs from documentation."
  170. )
  171. parser.add_argument(
  172. "-m", "--model",
  173. default="chatbot",
  174. help="Select the model to use for evaluation, this maybe a LoRA adapter."
  175. )
  176. parser.add_argument(
  177. "-c", "--config_path",
  178. default="eval_config.yaml",
  179. help="Set the configuration file path that has system prompt along with language, evalset path."
  180. )
  181. parser.add_argument(
  182. "-v", "--vllm_endpoint",
  183. default=None,
  184. type=int,
  185. help="If a port is specified, then use local vllm endpoint for evaluations."
  186. )
  187. parser.add_argument(
  188. "-j", "--judge_endpoint",
  189. default=None,
  190. type=int,
  191. help="If a port is specified, then use local vllm endpoint as judge LLM."
  192. )
  193. parser.add_argument(
  194. "-o", "--output_log",
  195. default="eval_result.log",
  196. help="save the eval result to a log file. Default is eval_result.log"
  197. )
  198. return parser.parse_args()
  199. if __name__ == "__main__":
  200. logging.info("Initializing the process and loading configuration...")
  201. args = parse_arguments()
  202. context = load_config(args.config_path)
  203. context["model"] = args.model
  204. context["endpoint"] = args.vllm_endpoint
  205. context["judge_endpoint"] = args.judge_endpoint
  206. context["output_log"] = args.output_log
  207. if context["endpoint"]:
  208. logging.info(f"Use local vllm service for eval at port: '{args.vllm_endpoint}'.")
  209. if context["judge_endpoint"]:
  210. logging.info(f"Use local vllm service for judge at port: '{args.judge_endpoint}'.")
  211. asyncio.run(main(context))