
RAG prompt template added

Kai Wu, 1 year ago
commit d5b67ab4e7

+ 1 - 1
recipes/use_cases/end2end-recipes/raft/README.md

@@ -125,7 +125,7 @@ CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server  --model m
 Then we can pass the port to the eval script:
 
 ```bash
-python eval_raft.py -m raft-8b -v 8000 -j 8002
+CUDA_VISIBLE_DEVICES=4 python eval_raft.py -m raft-8b -v 8000 -j 8002
 ```
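
The `-v` and `-j` flags point the eval script at two OpenAI-compatible vLLM servers: one serving the model under evaluation and one serving the LLM judge. A minimal sketch of how those port arguments can be turned into base URLs is shown below; the flag names follow the README command, but the actual argument parsing inside `eval_raft.py` may differ.

```python
# Hedged sketch: map the -m/-v/-j flags from the README command to the
# OpenAI-compatible endpoints used by eval_raft.py. Flag names mirror the
# README; the real parser in eval_raft.py may define them differently.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model_name", default="raft-8b")
parser.add_argument("-v", "--vllm_port", type=int, default=8000)   # serves the model being evaluated
parser.add_argument("-j", "--judge_port", type=int, default=8002)  # serves the LLM judge
args = parser.parse_args(["-m", "raft-8b", "-v", "8000", "-j", "8002"])

api_url = f"http://localhost:{args.vllm_port}/v1"         # passed as api_url to the answer generators
judge_api_url = f"http://localhost:{args.judge_port}/v1"  # used by compute_judge_score
print(api_url, judge_api_url)
```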
 
 

+ 14 - 18
recipes/use_cases/end2end-recipes/raft/eval_config.yaml

@@ -1,28 +1,24 @@
 eval_prompt_template: >
 You are an AI assistant skilled in answering questions related to Llama language models,
 which include Llama, Llama 2, Meta Llama 3, Code Llama, Meta Llama Guard 1, Meta Llama Guard 2,
-  Below is a question from a llama user, think step by step and then answer it in {language}, make the answer as concise as possible, it should be at most 100 words.
-  Return the result with the template:
-  [
-    {{
-      "Question": "The question user asked to you"
-      "Answer": "Your answer to the question"
-  }}
-  ]
+  Below is a question from a Llama user. Think step by step and make the answer as concise as possible;
+  the answer should be no more than 100 words. Please return the answer as plain text without any special tokens.
+
 judge_prompt_template: >
-  You are provided with a question, a teacher's answer and a student's answer. Given that question, you need to score the how good the student answer is compare to
-  the teacher's answer. If the student's answer is correct based on the teacher's answer, then return YES. If the answer is not faithful, then return NO
-  and explain which part of the student's answer is not faithful in the Reason section.
-  Return the result in json format with the template:
-    {{
-      "Reason": "your reason here.",
-      "Result": "YES or NO."
-    }}
+  You have been provided with a question, a teacher's answer and a student's answer above. Given that question, you need to score how good the student's answer is compared to
+  the teacher's answer. If the student's answer is correct based on the teacher's answer, return YES; otherwise return NO.
+  Review it carefully to make sure that the keywords and numerical values are exactly the same.
+  Only respond with "YES" or "NO", do not respond with anything else.
 
+RAG_prompt_template: >
+  Question: {question}\n Context: {context}\n
+  Answer this question using the information given in the context above. Here are some things to pay attention to:
+    - First provide step-by-step reasoning on how to answer the question.
+    - In the reasoning, if you need to copy-paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This means that anything outside of ##begin_quote## and ##end_quote## is not directly copied from the context.
+    - End your response with the final answer in the form <ANSWER>: $answer; the answer should be succinct.
+  You MUST begin your final answer with the tag "<ANSWER>:".
 eval_json: "./evalset.json"
 
-language: "English"
-
 raft_model_name: "raft-8b"
 
 base_model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
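
For reference, a minimal sketch of how the new `RAG_prompt_template` is consumed: `eval_raft.py` formats it with the retrieved documents and the question before sending it to the vLLM endpoint. The YAML loading shown here and the example question/context strings are assumptions for illustration, not part of this commit.

```python
# Minimal sketch: load eval_config.yaml and fill RAG_prompt_template the same
# way generate_answers_with_RAG does (rag_template.format(context=..., question=...)).
# The yaml.safe_load call and the example strings below are assumptions.
import yaml

with open("recipes/use_cases/end2end-recipes/raft/eval_config.yaml") as f:
    config = yaml.safe_load(f)

rag_template = config["RAG_prompt_template"]
# Hypothetical retrieved chunk, wrapped in <DOCUMENT> tags as format_docs_raft does.
documents = "<DOCUMENT>Meta Llama 3 is available in 8B and 70B parameter sizes.</DOCUMENT>\n"
question = "What parameter sizes does Meta Llama 3 come in?"

prompt = rag_template.format(context=documents, question=question)
print(prompt)  # this is the text passed to llm.batch in eval_raft.py
```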

+ 68 - 51
recipes/use_cases/end2end-recipes/raft/eval_raft.py

@@ -12,59 +12,79 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import DirectoryLoader
-from langchain.chains import RetrievalQA
+from langchain_core.runnables import RunnablePassthrough
+
 from langchain_core.messages import HumanMessage, SystemMessage
 import re
 import string
 from collections import Counter
+from langchain_core.output_parsers import StrOutputParser
+from langchain.prompts.prompt import PromptTemplate
 
 def generate_answers_model_only(model_name,question_list,api_url="http://localhost:8000/v1",key="EMPTY"):
         # Use langchain to load the documents from data directory
     # Load the RAFT model
+
     llm = VLLMOpenAI(
         openai_api_key=key,
         openai_api_base=api_url,
         model_name=model_name,
-        model_kwargs={"stop": ["."]},
-        temperature=0.0,)
+        temperature=0.0,
+        max_tokens=100
+        )
+    system_prompt = SystemMessage(content=context['eval_prompt_template'])
     generated_answers = []
-    for question in question_list:
-        response = llm.invoke(question)
-        generated_answers.append(response)
+    all_tasks = [[system_prompt, HumanMessage(content=question)] for question in question_list]
+    generated_answers = llm.batch(all_tasks)
     if len(generated_answers) == 0:
         logging.error("No model answers generated. Please check the input context or model configuration in ",model_name)
         return []
-    return generated_answers
-def generate_answers_with_RAG(model_name, data_dir,question_list, api_url="http://localhost:8000/v1",key="EMPTY"):
+    return clean_text_list(generated_answers)
+def format_docs_raft(docs):
+    context = ""
+    for doc in docs:
+        context += "<DOCUMENT>" + str(doc.page_content) + "</DOCUMENT>\n"
+    return context
+def format_docs(docs):
+    return "\n\n".join(doc.page_content for doc in docs)
+def generate_answers_with_RAG(model_name, data_dir,question_list,rag_template,api_url="http://localhost:8000/v1",key="EMPTY"):
     # Use langchain to load the documents from data directory
     loader = DirectoryLoader(data_dir)
     docs = loader.load()
     # Split the document into chunks with a specified chunk size
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
     all_splits = text_splitter.split_documents(docs)
 
     # Store the document into a vector store with a specific embedding model
-    vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))
+    vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2",model_kwargs={'device': 'cuda'}))
+    retriever = vectorstore.as_retriever(
+        search_kwargs={"k": 5}
+    )
     # Load the RAFT model
     llm = VLLMOpenAI(
         openai_api_key=key,
         openai_api_base=api_url,
         model_name=model_name,
-        model_kwargs={"stop": ["."]},
-        temperature=0.0,)
-    # Create a RetrievalQA chain with the vector store and RAFT model
-    qa_chain = RetrievalQA.from_chain_type(
-    llm,
-    retriever=vectorstore.as_retriever()
-    )
-    generated_answers = []
-    for question in question_list:
-        response = qa_chain({"query": question})
-        generated_answers.append(response['result'])
+        temperature=0.0,
+        max_tokens=100
+        )
+    all_tasks = []
+    for q in question_list:
+        # retrieve the top k=5 documents (set via search_kwargs above)
+        retrieved_docs = retriever.invoke(q)
+        # format the documents into a string
+        if '8B-Instruct' in model_name:
+            documents = format_docs(retrieved_docs)
+        else:
+            documents = format_docs_raft(retrieved_docs)
+        # create a prompt
+        text = rag_template.format(context=documents,question=q)
+        all_tasks.append(text)
+    generated_answers = llm.batch(all_tasks)
     if len(generated_answers) == 0:
         logging.error("No RAG answers generated. Please check the input context or model configuration in ",model_name)
         return []
-    return generated_answers
+    return clean_text_list(generated_answers)
 def compute_rouge_score(generated : list, reference: list):
     rouge_score = evaluate.load('rouge')
     return rouge_score.compute(
@@ -73,14 +93,19 @@ def compute_rouge_score(generated : list, reference: list):
         use_stemmer=True,
         use_aggregator=True
     )
-def remove_special_tokens(text_list):
-    clean_text_list = []
+def clean_text_list(text_list):
+    result = []
     for text in text_list:
-        text = text.replace("##begin_quote##","")
-        text = text.replace("##end_quote##","")
+        # for the RAFT model, the answer will start with <ANSWER>
+        index = text.rfind("<ANSWER>")
+        if index != -1:
+            text = text[index:]
+        text = text.replace("begin_quote","")
+        text = text.replace("end_quote","")
+        text = text.replace("##","")
         text = text.strip()
-        clean_text_list.append(text)
-    return clean_text_list
+        result.append(text)
+    return result
 
 def normalize_answer(s):
 
@@ -125,25 +150,20 @@ def compute_judge_score(questions: list, generated : list, reference: list, cont
         openai_api_key=key,
         openai_api_base=api_url,
         model_name=model_name,
-        model_kwargs={"stop": ["."]},
-        temperature=0.0,)
+        temperature=0.0)
+    all_tasks = []
     for q,pred,gold in zip(questions, generated,reference):
-        # messages = [
-        #     SystemMessage(content=context['judge_prompt_template']),
-        #     HumanMessage(content=f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "),
-        # ]
-        messages = context['judge_prompt_template'] + "\n"
-        messages += f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "
-        response = llm.invoke(messages)
-        print(response+ " -------------")
-        result = json.loads(response)
-        if "Result" not in result:
-            print("Error: eval response does not contain answer")
-            print(result)
-            continue
-        correct_num += result["Result"] == "YES"
+        messages = [
+            HumanMessage(content=f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "),
+            SystemMessage(content=context['judge_prompt_template'])
+        ]
+        all_tasks.append(messages)
+    responses = llm.batch(all_tasks)
+    for response in responses:
+        if "YES" in response:
+            correct_num += 1
     return correct_num/len(questions)
-def score_single(context,generated,reference,questions, run_exact_match=True,run_rouge=True, run_bert=True, run_llm_as_judge=True):
+def score_single(context,generated,reference,questions, run_exact_match=True,run_rouge=True, run_bert=True, run_llm_as_judge=False):
     # set metric to default -1, means no metric is computed
     metric = {
         "Rouge_score": -1,
@@ -192,15 +212,12 @@ def main(context):
         }
         # Generate answers for baseline
         base_model_name = context["base_model_name"]
-        generated_answers["Baseline"] = generate_answers_model_only(base_model_name,questions,api_url)
-        #generated_answers["Baseline_RAG"] = generate_answers_with_RAG(base_model_name, context["data_dir"],questions,api_url)
+        #generated_answers["Baseline"] = generate_answers_model_only(base_model_name,questions,api_url)
+        generated_answers["Baseline_RAG"] = generate_answers_with_RAG(base_model_name, context["data_dir"],questions,context['RAG_prompt_template'],api_url)
         # Generate answers for RAFT
         raft_model_name = context["raft_model_name"]
         #generated_answers["RAFT"] = generate_answers_model_only(raft_model_name,questions,api_url)
-        #generated_answers["RAFT_RAG"] = generate_answers_with_RAG(raft_model_name, context["data_dir"],questions,api_url)
-        # clean special tokens from the RAFT generated answer
-        #generated_answers["RAFT"] = remove_special_tokens(generated_answers["RAFT"])
-        #generated_answers["RAFT_RAG"] = remove_special_tokens(generated_answers["RAFT_RAG"])
+        generated_answers["RAFT_RAG"] = generate_answers_with_RAG(raft_model_name, context["data_dir"],questions,context['RAG_prompt_template'],api_url)
         logging.info(f"Successfully generated {len(generated_answers['Baseline_RAG'])} answers for all models.")
         # for generate answer from each model, compute the score metric
         for model_name,model_answer in generated_answers.items():
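
For clarity, the new answer-cleaning step can be exercised on its own. The snippet below copies the `clean_text_list` logic added above and shows what it does to a RAFT-style generation; the raw string is hypothetical.

```python
# Standalone copy of the clean_text_list logic added in this commit, with a
# hypothetical RAFT-style generation to show the effect of the cleanup.
def clean_text_list(text_list):
    result = []
    for text in text_list:
        # keep only the final answer for RAFT-style outputs
        index = text.rfind("<ANSWER>")
        if index != -1:
            text = text[index:]
        # strip the RAFT quote markers
        text = text.replace("begin_quote", "").replace("end_quote", "").replace("##", "")
        result.append(text.strip())
    return result

raw = ["Reasoning: ##begin_quote##Llama 3 comes in 8B and 70B.##end_quote## <ANSWER>: 8B and 70B"]
print(clean_text_list(raw))  # ['<ANSWER>: 8B and 70B']
```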

File diff suppressed because it is too large
+ 57 - 37
recipes/use_cases/end2end-recipes/raft/evalset.json


+ 0 - 1
recipes/use_cases/end2end-recipes/raft/raft.py

@@ -36,7 +36,6 @@ async def main(context):
                 logging.info(f"Question: {question}")
         logging.info(f"Successfully generated {sum([len(q) for c,q in chunk_questions_zip])} question/answer pairs.")
         ds = await add_chunk_to_dataset(chunk_questions_zip,context, chat_service,ds,NUM_DISTRACT_DOCS, ORCALE_P)
-        print(ds[0])
         ds.save_to_disk(args.output)
         logging.info(f"Data successfully written to {context['output']}. Process completed.")
         formatter = DatasetConverter()