
rag prompt template added

Kai Wu, 1 year ago
commit d5b67ab4e7

+ 1 - 1
recipes/use_cases/end2end-recipes/raft/README.md

@@ -125,7 +125,7 @@ CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server  --model m
 Then we can pass the port to the eval script:

 ```bash
-python eval_raft.py -m raft-8b -v 8000 -j 8002
+CUDA_VISIBLE_DEVICES=4 python eval_raft.py -m raft-8b -v 8000 -j 8002
 ```

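For reference, `eval_raft.py` (changed below) turns the `-v` and `-j` ports into OpenAI-compatible vLLM endpoints. A minimal sketch of how the `-v 8000` port might be consumed, based on the script's default `http://localhost:<port>/v1` base URL; the example question is made up:

```python
# Sketch only: mirrors the VLLMOpenAI client setup used in eval_raft.py.
from langchain_community.llms import VLLMOpenAI

vllm_port = 8000  # value passed via -v
llm = VLLMOpenAI(
    openai_api_key="EMPTY",                              # vLLM does not check the key
    openai_api_base=f"http://localhost:{vllm_port}/v1",  # assumed URL scheme, per the script default
    model_name="raft-8b",
    temperature=0.0,
    max_tokens=100,
)
print(llm.invoke("What is Meta Llama Guard 2?"))  # made-up example question
```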

+ 14 - 18
recipes/use_cases/end2end-recipes/raft/eval_config.yaml

@@ -1,28 +1,24 @@
 eval_prompt_template: >
   You are a AI assistant that skilled in answering questions related to Llama language models,
   which includes LLama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1,	Meta Llama Guard 2,
-  Below is a question from a llama user, think step by step and then answer it in {language}, make the answer as concise as possible, it should be at most 100 words.
-  Return the result with the template:
-  [
-    {{
-      "Question": "The question user asked to you"
-      "Answer": "Your answer to the question"
-  }}
-  ]
+  Below is a question from a Llama user. Think step by step and make the answer as concise as possible;
+  the returned answer should be no more than 100 words. Please return the answer as plain text without any special tokens.
+
 judge_prompt_template: >
-  You are provided with a question, a teacher's answer and a student's answer. Given that question, you need to score the how good the student answer is compare to
-  the teacher's answer. If the student's answer is correct based on the teacher's answer, then return YES. If the answer is not faithful, then return NO
-  and explain which part of the student's answer is not faithful in the Reason section.
-  Return the result in json format with the template:
-    {{
-      "Reason": "your reason here.",
-      "Result": "YES or NO."
-    }}
+  You have been provided with a question, a teacher's answer and a student's answer above. Given that question, you need to score how good the student's answer is compared to
+  the teacher's answer. If the student's answer is correct based on the teacher's answer, then return YES, else return NO.
+  Review it carefully to make sure that the keywords and numerical values are exactly the same.
+  Only respond with "YES" or "NO"; do not respond with anything else.

+RAG_prompt_template: >
+  Question: {question}\n Context: {context}\n
+  Answer this question using the information given in the context above. Here are things to pay attention to:
+    - First provide step-by-step reasoning on how to answer the question.
+    - In the reasoning, if you need to copy-paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copied from the context.
+    - End your response with the final answer in the form <ANSWER>: $answer; the answer should be succinct.
+  You MUST begin your final answer with the tag "<ANSWER>:"
 eval_json: "./evalset.json"

-language: "English"
-
 raft_model_name: "raft-8b"

 base_model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
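
The new `RAG_prompt_template` is consumed by `eval_raft.py` (next file) through plain Python string formatting, with retrieved documents wrapped in `<DOCUMENT>` tags for the RAFT model. A minimal sketch of that step, assuming the config is loaded with PyYAML; the variable names, question and context text are made up for illustration:

```python
# Sketch: how eval_raft.py fills RAG_prompt_template (inputs here are made up).
import yaml

with open("eval_config.yaml") as f:
    config = yaml.safe_load(f)

rag_template = config["RAG_prompt_template"]
documents = "<DOCUMENT>Meta Llama 3 is available in 8B and 70B parameter sizes.</DOCUMENT>\n"
question = "What sizes does Meta Llama 3 come in?"
prompt = rag_template.format(context=documents, question=question)
print(prompt)
```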

+ 68 - 51
recipes/use_cases/end2end-recipes/raft/eval_raft.py

@@ -12,59 +12,79 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import DirectoryLoader
-from langchain.chains import RetrievalQA
+from langchain_core.runnables import RunnablePassthrough
+
 from langchain_core.messages import HumanMessage, SystemMessage
 import re
 import string
 from collections import Counter
+from langchain_core.output_parsers import StrOutputParser
+from langchain.prompts.prompt import PromptTemplate

 def generate_answers_model_only(model_name,question_list,api_url="http://localhost:8000/v1",key="EMPTY"):
         # Use langchain to load the documents from data directory
     # Load the RAFT model
+
     llm = VLLMOpenAI(
         openai_api_key=key,
         openai_api_base=api_url,
         model_name=model_name,
-        model_kwargs={"stop": ["."]},
-        temperature=0.0,)
+        temperature=0.0,
+        max_tokens=100
+        )
+    system_prompt = SystemMessage(content=context['eval_prompt_template'])
     generated_answers = []
-    for question in question_list:
-        response = llm.invoke(question)
-        generated_answers.append(response)
+    all_tasks = [[system_prompt, HumanMessage(content=question)] for question in question_list]
+    generated_answers = llm.batch(all_tasks)
     if len(generated_answers) == 0:
         logging.error("No model answers generated. Please check the input context or model configuration in ",model_name)
         return []
-    return generated_answers
-def generate_answers_with_RAG(model_name, data_dir,question_list, api_url="http://localhost:8000/v1",key="EMPTY"):
+    return clean_text_list(generated_answers)
+def format_docs_raft(docs):
+    context = ""
+    for doc in docs:
+        context += "<DOCUMENT>" + str(doc.page_content) + "</DOCUMENT>\n"
+    return context
+def format_docs(docs):
+    return "\n\n".join(doc.page_content for doc in docs)
+def generate_answers_with_RAG(model_name, data_dir,question_list,rag_template,api_url="http://localhost:8000/v1",key="EMPTY"):
     # Use langchain to load the documents from data directory
     loader = DirectoryLoader(data_dir)
     docs = loader.load()
     # Split the document into chunks with a specified chunk size
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
     all_splits = text_splitter.split_documents(docs)

     # Store the document into a vector store with a specific embedding model
-    vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))
+    vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2",model_kwargs={'device': 'cuda'}))
+    retriever = vectorstore.as_retriever(
+        search_kwargs={"k": 5}
+    )
     # Load the RAFT model
     llm = VLLMOpenAI(
         openai_api_key=key,
         openai_api_base=api_url,
         model_name=model_name,
-        model_kwargs={"stop": ["."]},
-        temperature=0.0,)
-    # Create a RetrievalQA chain with the vector store and RAFT model
-    qa_chain = RetrievalQA.from_chain_type(
-    llm,
-    retriever=vectorstore.as_retriever()
-    )
-    generated_answers = []
-    for question in question_list:
-        response = qa_chain({"query": question})
-        generated_answers.append(response['result'])
+        temperature=0.0,
+        max_tokens=100
+        )
+    all_tasks = []
+    for q in question_list:
+        # retrieve the top-k (k=5) documents for this question
+        retrieved_docs = retriever.invoke(q)
+        # format the documents into a string
+        if '8B-Instruct' in model_name:
+            documents = format_docs(retrieved_docs)
+        else:
+            documents = format_docs_raft(retrieved_docs)
+        # create a prompt
+        text = rag_template.format(context=documents,question=q)
+        all_tasks.append(text)
+    generated_answers = llm.batch(all_tasks)
     if len(generated_answers) == 0:
         logging.error("No RAG answers generated. Please check the input context or model configuration in ",model_name)
         return []
-    return generated_answers
+    return clean_text_list(generated_answers)
 def compute_rouge_score(generated : list, reference: list):
     rouge_score = evaluate.load('rouge')
     return rouge_score.compute(
@@ -73,14 +93,19 @@ def compute_rouge_score(generated : list, reference: list):
         use_stemmer=True,
         use_aggregator=True
     )
-def remove_special_tokens(text_list):
-    clean_text_list = []
+def clean_text_list(text_list):
+    result = []
     for text in text_list:
-        text = text.replace("##begin_quote##","")
-        text = text.replace("##end_quote##","")
+        # for the RAFT model, the answer will start with an <ANSWER> tag
+        index = text.rfind("<ANSWER>")
+        if index!= -1:
+            text = text[index:]
+        text = text.replace("begin_quote","")
+        text = text.replace("end_quote","")
+        text = text.replace("##","")
         text = text.strip()
-        clean_text_list.append(text)
-    return clean_text_list
+        result.append(text)
+    return result

 def normalize_answer(s):

@@ -125,25 +150,20 @@ def compute_judge_score(questions: list, generated : list, reference: list, cont
         openai_api_key=key,
         openai_api_base=api_url,
         model_name=model_name,
-        model_kwargs={"stop": ["."]},
-        temperature=0.0,)
+        temperature=0.0)
+    all_tasks = []
     for q,pred,gold in zip(questions, generated,reference):
-        # messages = [
-        #     SystemMessage(content=context['judge_prompt_template']),
-        #     HumanMessage(content=f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "),
-        # ]
-        messages = context['judge_prompt_template'] + "\n"
-        messages += f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "
-        response = llm.invoke(messages)
-        print(response+ " -------------")
-        result = json.loads(response)
-        if "Result" not in result:
-            print("Error: eval response does not contain answer")
-            print(result)
-            continue
-        correct_num += result["Result"] == "YES"
+        messages = [
+            HumanMessage(content=f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "),
+            SystemMessage(content=context['judge_prompt_template'])
+        ]
+        all_tasks.append(messages)
+    responses = llm.batch(all_tasks)
+    for response in responses:
+        if "YES" in response:
+            correct_num += 1
     return correct_num/len(questions)
-def score_single(context,generated,reference,questions, run_exact_match=True,run_rouge=True, run_bert=True, run_llm_as_judge=True):
+def score_single(context,generated,reference,questions, run_exact_match=True,run_rouge=True, run_bert=True, run_llm_as_judge=False):
     # set metric to default -1, means no metric is computed
     metric = {
         "Rouge_score": -1,
@@ -192,15 +212,12 @@ def main(context):
         }
         # Generate answers for baseline
         base_model_name = context["base_model_name"]
-        generated_answers["Baseline"] = generate_answers_model_only(base_model_name,questions,api_url)
-        #generated_answers["Baseline_RAG"] = generate_answers_with_RAG(base_model_name, context["data_dir"],questions,api_url)
+        #generated_answers["Baseline"] = generate_answers_model_only(base_model_name,questions,api_url)
+        generated_answers["Baseline_RAG"] = generate_answers_with_RAG(base_model_name, context["data_dir"],questions,context['RAG_prompt_template'],api_url)
         # Generate answers for RAFT
         raft_model_name = context["raft_model_name"]
         #generated_answers["RAFT"] = generate_answers_model_only(raft_model_name,questions,api_url)
-        #generated_answers["RAFT_RAG"] = generate_answers_with_RAG(raft_model_name, context["data_dir"],questions,api_url)
-        # clean special tokens from the RAFT generated answer
-        #generated_answers["RAFT"] = remove_special_tokens(generated_answers["RAFT"])
-        #generated_answers["RAFT_RAG"] = remove_special_tokens(generated_answers["RAFT_RAG"])
+        generated_answers["RAFT_RAG"] = generate_answers_with_RAG(raft_model_name, context["data_dir"],questions,context['RAG_prompt_template'],api_url)
         logging.info(f"Successfully generated {len(generated_answers['Baseline_RAG'])} answers for all models.")
         logging.info(f"Successfully generated {len(generated_answers['Baseline_RAG'])} answers for all models.")
         # for generate answer from each model, compute the score metric
         # for generate answer from each model, compute the score metric
         for model_name,model_answer in generated_answers.items():
         for model_name,model_answer in generated_answers.items():
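
To show what the reworked `clean_text_list` does to a RAFT-style completion, here is a small standalone sketch of the same logic; the raw response string is invented for illustration:

```python
# Standalone sketch of the cleaning logic added above; the raw response is made up.
def clean_text(text: str) -> str:
    index = text.rfind("<ANSWER>")   # RAFT answers end with an "<ANSWER>:" tag
    if index != -1:
        text = text[index:]          # drop the reasoning before the tag
    for token in ("begin_quote", "end_quote", "##"):
        text = text.replace(token, "")
    return text.strip()

raw = (
    "The context says ##begin_quote##Meta Llama 3 comes in 8B and 70B##end_quote##. "
    "<ANSWER>: Meta Llama 3 comes in 8B and 70B parameter sizes."
)
print(clean_text(raw))
# -> "<ANSWER>: Meta Llama 3 comes in 8B and 70B parameter sizes."
```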

File diff suppressed because it is too large
+ 57 - 37
recipes/use_cases/end2end-recipes/raft/evalset.json


+ 0 - 1
recipes/use_cases/end2end-recipes/raft/raft.py

@@ -36,7 +36,6 @@ async def main(context):
                 logging.info(f"Question: {question}")
                 logging.info(f"Question: {question}")
         logging.info(f"Successfully generated {sum([len(q) for c,q in chunk_questions_zip])} question/answer pairs.")
         logging.info(f"Successfully generated {sum([len(q) for c,q in chunk_questions_zip])} question/answer pairs.")
         ds = await add_chunk_to_dataset(chunk_questions_zip,context, chat_service,ds,NUM_DISTRACT_DOCS, ORCALE_P)
         ds = await add_chunk_to_dataset(chunk_questions_zip,context, chat_service,ds,NUM_DISTRACT_DOCS, ORCALE_P)
-        print(ds[0])
         ds.save_to_disk(args.output)
         ds.save_to_disk(args.output)
         logging.info(f"Data successfully written to {context['output']}. Process completed.")
         logging.info(f"Data successfully written to {context['output']}. Process completed.")
         formatter = DatasetConverter()
         formatter = DatasetConverter()