
RAG prompt template added

Kai Wu, 1 year ago
commit d5b67ab4e7

+ 1 - 1
recipes/use_cases/end2end-recipes/raft/README.md

@@ -125,7 +125,7 @@ CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server  --model m
 Then we can pass the port to the eval script:
 
 ```bash
-python eval_raft.py -m raft-8b -v 8000 -j 8002
+CUDA_VISIBLE_DEVICES=4 python eval_raft.py -m raft-8b -v 8000 -j 8002
 ```
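
The `-v` and `-j` flags point the eval script at two OpenAI-compatible vLLM servers: one serving the model under evaluation and one serving the LLM judge. A minimal sketch of how those port arguments can be turned into base URLs is shown below; the flag names follow the README command, but the actual argument parsing inside `eval_raft.py` may differ.

```python
# Hedged sketch: map the -m/-v/-j flags from the README command to the
# OpenAI-compatible endpoints used by eval_raft.py. Flag names mirror the
# README; the real parser in eval_raft.py may define them differently.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model_name", default="raft-8b")
parser.add_argument("-v", "--vllm_port", type=int, default=8000)   # serves the model being evaluated
parser.add_argument("-j", "--judge_port", type=int, default=8002)  # serves the LLM judge
args = parser.parse_args(["-m", "raft-8b", "-v", "8000", "-j", "8002"])

api_url = f"http://localhost:{args.vllm_port}/v1"         # passed as api_url to the answer generators
judge_api_url = f"http://localhost:{args.judge_port}/v1"  # used by compute_judge_score
print(api_url, judge_api_url)
```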
 
 

+ 14 - 18
recipes/use_cases/end2end-recipes/raft/eval_config.yaml

@@ -1,28 +1,24 @@
 eval_prompt_template: >
 You are an AI assistant skilled in answering questions related to Llama language models,
 which include Llama, Llama 2, Meta Llama 3, Code Llama, Meta Llama Guard 1, Meta Llama Guard 2,
-  Below is a question from a llama user, think step by step and then answer it in {language}, make the answer as concise as possible, it should be at most 100 words.
-  Return the result with the template:
-  [
-    {{
-      "Question": "The question user asked to you"
-      "Answer": "Your answer to the question"
-  }}
-  ]
+  Below is a question from a Llama user. Think step by step and make the answer as concise as possible;
+  the answer should be no more than 100 words. Please return the answer as plain text without any special tokens.
+
 judge_prompt_template: >
-  You are provided with a question, a teacher's answer and a student's answer. Given that question, you need to score the how good the student answer is compare to
-  the teacher's answer. If the student's answer is correct based on the teacher's answer, then return YES. If the answer is not faithful, then return NO
-  and explain which part of the student's answer is not faithful in the Reason section.
-  Return the result in json format with the template:
-    {{
-      "Reason": "your reason here.",
-      "Result": "YES or NO."
-    }}
+  You have been provided with a question, a teacher's answer and a student's answer above. Given that question, you need to score how good the student's answer is compared to
+  the teacher's answer. If the student's answer is correct based on the teacher's answer, return YES; otherwise return NO.
+  Review it carefully to make sure that the keywords and numerical values are exactly the same.
+  Only respond with "YES" or "NO", do not respond with anything else.
 
+RAG_prompt_template: >
+  Question: {question}\n Context: {context}\n
+  Answer this question using the information given in the context above. Here are some things to pay attention to:
+    - First provide step-by-step reasoning on how to answer the question.
+    - In the reasoning, if you need to copy-paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This means that anything outside of ##begin_quote## and ##end_quote## is not directly copied from the context.
+    - End your response with the final answer in the form <ANSWER>: $answer; the answer should be succinct.
+  You MUST begin your final answer with the tag "<ANSWER>:".
 eval_json: "./evalset.json"
 
-language: "English"
-
 raft_model_name: "raft-8b"
 
 base_model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
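
For reference, a minimal sketch of how the new `RAG_prompt_template` is consumed: `eval_raft.py` formats it with the retrieved documents and the question before sending it to the vLLM endpoint. The YAML loading shown here and the example question/context strings are assumptions for illustration, not part of this commit.

```python
# Minimal sketch: load eval_config.yaml and fill RAG_prompt_template the same
# way generate_answers_with_RAG does (rag_template.format(context=..., question=...)).
# The yaml.safe_load call and the example strings below are assumptions.
import yaml

with open("recipes/use_cases/end2end-recipes/raft/eval_config.yaml") as f:
    config = yaml.safe_load(f)

rag_template = config["RAG_prompt_template"]
# Hypothetical retrieved chunk, wrapped in <DOCUMENT> tags as format_docs_raft does.
documents = "<DOCUMENT>Meta Llama 3 is available in 8B and 70B parameter sizes.</DOCUMENT>\n"
question = "What parameter sizes does Meta Llama 3 come in?"

prompt = rag_template.format(context=documents, question=question)
print(prompt)  # this is the text passed to llm.batch in eval_raft.py
```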

+ 68 - 51
recipes/use_cases/end2end-recipes/raft/eval_raft.py

@@ -12,59 +12,79 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import DirectoryLoader
-from langchain.chains import RetrievalQA
+from langchain_core.runnables import RunnablePassthrough
+
 from langchain_core.messages import HumanMessage, SystemMessage
 import re
 import string
 from collections import Counter
+from langchain_core.output_parsers import StrOutputParser
+from langchain.prompts.prompt import PromptTemplate
 
 def generate_answers_model_only(model_name,question_list,api_url="http://localhost:8000/v1",key="EMPTY"):
         # Use langchain to load the documents from data directory
     # Load the RAFT model
+
     llm = VLLMOpenAI(
         openai_api_key=key,
         openai_api_base=api_url,
         model_name=model_name,
-        model_kwargs={"stop": ["."]},
-        temperature=0.0,)
+        temperature=0.0,
+        max_tokens=100
+        )
+    system_prompt = SystemMessage(content=context['eval_prompt_template'])
     generated_answers = []
-    for question in question_list:
-        response = llm.invoke(question)
-        generated_answers.append(response)
+    all_tasks = [[system_prompt, HumanMessage(content=question)] for question in question_list]
+    generated_answers = llm.batch(all_tasks)
     if len(generated_answers) == 0:
         logging.error("No model answers generated. Please check the input context or model configuration in ",model_name)
         return []
-    return generated_answers
-def generate_answers_with_RAG(model_name, data_dir,question_list, api_url="http://localhost:8000/v1",key="EMPTY"):
+    return clean_text_list(generated_answers)
+def format_docs_raft(docs):
+    context = ""
+    for doc in docs:
+        context += "<DOCUMENT>" + str(doc.page_content) + "</DOCUMENT>\n"
+    return context
+def format_docs(docs):
+    return "\n\n".join(doc.page_content for doc in docs)
+def generate_answers_with_RAG(model_name, data_dir,question_list,rag_template,api_url="http://localhost:8000/v1",key="EMPTY"):
     # Use langchain to load the documents from data directory
     loader = DirectoryLoader(data_dir)
     docs = loader.load()
     # Split the document into chunks with a specified chunk size
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
     all_splits = text_splitter.split_documents(docs)
 
     # Store the document into a vector store with a specific embedding model
-    vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))
+    vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2",model_kwargs={'device': 'cuda'}))
+    retriever = vectorstore.as_retriever(
+        search_kwargs={"k": 5}
+    )
     # Load the RAFT model
     llm = VLLMOpenAI(
         openai_api_key=key,
         openai_api_base=api_url,
         model_name=model_name,
-        model_kwargs={"stop": ["."]},
-        temperature=0.0,)
-    # Create a RetrievalQA chain with the vector store and RAFT model
-    qa_chain = RetrievalQA.from_chain_type(
-    llm,
-    retriever=vectorstore.as_retriever()
-    )
-    generated_answers = []
-    for question in question_list:
-        response = qa_chain({"query": question})
-        generated_answers.append(response['result'])
+        temperature=0.0,
+        max_tokens=100
+        )
+    all_tasks = []
+    for q in question_list:
+        # retrieve the top k=5 documents (set via search_kwargs above)
+        retrieved_docs = retriever.invoke(q)
+        # format the documents into a string
+        if '8B-Instruct' in model_name:
+            documents = format_docs(retrieved_docs)
+        else:
+            documents = format_docs_raft(retrieved_docs)
+        # create a prompt
+        text = rag_template.format(context=documents,question=q)
+        all_tasks.append(text)
+    generated_answers = llm.batch(all_tasks)
     if len(generated_answers) == 0:
         logging.error("No RAG answers generated. Please check the input context or model configuration in ",model_name)
         return []
-    return generated_answers
+    return clean_text_list(generated_answers)
 def compute_rouge_score(generated : list, reference: list):
     rouge_score = evaluate.load('rouge')
     return rouge_score.compute(
@@ -73,14 +93,19 @@ def compute_rouge_score(generated : list, reference: list):
         use_stemmer=True,
         use_aggregator=True
     )
-def remove_special_tokens(text_list):
-    clean_text_list = []
+def clean_text_list(text_list):
+    result = []
     for text in text_list:
-        text = text.replace("##begin_quote##","")
-        text = text.replace("##end_quote##","")
+        # for the RAFT model, the answer will start with <ANSWER>
+        index = text.rfind("<ANSWER>")
+        if index != -1:
+            text = text[index:]
+        text = text.replace("begin_quote","")
+        text = text.replace("end_quote","")
+        text = text.replace("##","")
         text = text.strip()
-        clean_text_list.append(text)
-    return clean_text_list
+        result.append(text)
+    return result
 
 def normalize_answer(s):
 
@@ -125,25 +150,20 @@ def compute_judge_score(questions: list, generated : list, reference: list, cont
         openai_api_key=key,
         openai_api_base=api_url,
         model_name=model_name,
-        model_kwargs={"stop": ["."]},
-        temperature=0.0,)
+        temperature=0.0)
+    all_tasks = []
     for q,pred,gold in zip(questions, generated,reference):
-        # messages = [
-        #     SystemMessage(content=context['judge_prompt_template']),
-        #     HumanMessage(content=f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "),
-        # ]
-        messages = context['judge_prompt_template'] + "\n"
-        messages += f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "
-        response = llm.invoke(messages)
-        print(response+ " -------------")
-        result = json.loads(response)
-        if "Result" not in result:
-            print("Error: eval response does not contain answer")
-            print(result)
-            continue
-        correct_num += result["Result"] == "YES"
+        messages = [
+            HumanMessage(content=f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "),
+            SystemMessage(content=context['judge_prompt_template'])
+        ]
+        all_tasks.append(messages)
+    responses = llm.batch(all_tasks)
+    for response in responses:
+        if "YES" in response:
+            correct_num += 1
     return correct_num/len(questions)
-def score_single(context,generated,reference,questions, run_exact_match=True,run_rouge=True, run_bert=True, run_llm_as_judge=True):
+def score_single(context,generated,reference,questions, run_exact_match=True,run_rouge=True, run_bert=True, run_llm_as_judge=False):
     # set metric to default -1, means no metric is computed
     metric = {
         "Rouge_score": -1,
@@ -192,15 +212,12 @@ def main(context):
         }
         # Generate answers for baseline
         base_model_name = context["base_model_name"]
-        generated_answers["Baseline"] = generate_answers_model_only(base_model_name,questions,api_url)
-        #generated_answers["Baseline_RAG"] = generate_answers_with_RAG(base_model_name, context["data_dir"],questions,api_url)
+        #generated_answers["Baseline"] = generate_answers_model_only(base_model_name,questions,api_url)
+        generated_answers["Baseline_RAG"] = generate_answers_with_RAG(base_model_name, context["data_dir"],questions,context['RAG_prompt_template'],api_url)
         # Generate answers for RAFT
         raft_model_name = context["raft_model_name"]
         #generated_answers["RAFT"] = generate_answers_model_only(raft_model_name,questions,api_url)
-        #generated_answers["RAFT_RAG"] = generate_answers_with_RAG(raft_model_name, context["data_dir"],questions,api_url)
-        # clean special tokens from the RAFT generated answer
-        #generated_answers["RAFT"] = remove_special_tokens(generated_answers["RAFT"])
-        #generated_answers["RAFT_RAG"] = remove_special_tokens(generated_answers["RAFT_RAG"])
+        generated_answers["RAFT_RAG"] = generate_answers_with_RAG(raft_model_name, context["data_dir"],questions,context['RAG_prompt_template'],api_url)
         logging.info(f"Successfully generated {len(generated_answers['Baseline_RAG'])} answers for all models.")
         # for generate answer from each model, compute the score metric
         for model_name,model_answer in generated_answers.items():
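
For clarity, the new answer-cleaning step can be exercised on its own. The snippet below copies the `clean_text_list` logic added above and shows what it does to a RAFT-style generation; the raw string is hypothetical.

```python
# Standalone copy of the clean_text_list logic added in this commit, with a
# hypothetical RAFT-style generation to show the effect of the cleanup.
def clean_text_list(text_list):
    result = []
    for text in text_list:
        # keep only the final answer for RAFT-style outputs
        index = text.rfind("<ANSWER>")
        if index != -1:
            text = text[index:]
        # strip the RAFT quote markers
        text = text.replace("begin_quote", "").replace("end_quote", "").replace("##", "")
        result.append(text.strip())
    return result

raw = ["Reasoning: ##begin_quote##Llama 3 comes in 8B and 70B.##end_quote## <ANSWER>: 8B and 70B"]
print(clean_text_list(raw))  # ['<ANSWER>: 8B and 70B']
```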

File diff suppressed because it is too large
+ 57 - 37
recipes/use_cases/end2end-recipes/raft/evalset.json


+ 0 - 1
recipes/use_cases/end2end-recipes/raft/raft.py

@@ -36,7 +36,6 @@ async def main(context):
                 logging.info(f"Question: {question}")
         logging.info(f"Successfully generated {sum([len(q) for c,q in chunk_questions_zip])} question/answer pairs.")
         ds = await add_chunk_to_dataset(chunk_questions_zip,context, chat_service,ds,NUM_DISTRACT_DOCS, ORCALE_P)
-        print(ds[0])
         ds.save_to_disk(args.output)
         logging.info(f"Data successfully written to {context['output']}. Process completed.")
         formatter = DatasetConverter()