
rag prompt template added

Kai Wu, 1 year ago
commit d5b67ab4e7

+ 1 - 1
recipes/use_cases/end2end-recipes/raft/README.md

@@ -125,7 +125,7 @@ CUDA_VISIBLE_DEVICES=2,3 python -m vllm.entrypoints.openai.api_server  --model m
 Then we can pass the port to the eval script:

 ```bash
-python eval_raft.py -m raft-8b -v 8000 -j 8002
+CUDA_VISIBLE_DEVICES=4 python eval_raft.py -m raft-8b -v 8000 -j 8002
 ```

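For reference, `eval_raft.py` (changed below) turns the `-v` and `-j` ports into OpenAI-compatible vLLM endpoints. A minimal sketch of how the `-v 8000` port might be consumed, based on the script's default `http://localhost:<port>/v1` base URL; the example question is made up:

```python
# Sketch only: mirrors the VLLMOpenAI client setup used in eval_raft.py.
from langchain_community.llms import VLLMOpenAI

vllm_port = 8000  # value passed via -v
llm = VLLMOpenAI(
    openai_api_key="EMPTY",                              # vLLM does not check the key
    openai_api_base=f"http://localhost:{vllm_port}/v1",  # assumed URL scheme, per the script default
    model_name="raft-8b",
    temperature=0.0,
    max_tokens=100,
)
print(llm.invoke("What is Meta Llama Guard 2?"))  # made-up example question
```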

+ 14 - 18
recipes/use_cases/end2end-recipes/raft/eval_config.yaml

@@ -1,28 +1,24 @@
 eval_prompt_template: >
   You are a AI assistant that skilled in answering questions related to Llama language models,
   which includes LLama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1,	Meta Llama Guard 2,
-  Below is a question from a llama user, think step by step and then answer it in {language}, make the answer as concise as possible, it should be at most 100 words.
-  Return the result with the template:
-  [
-    {{
-      "Question": "The question user asked to you"
-      "Answer": "Your answer to the question"
-  }}
-  ]
+  Below is a question from a Llama user. Think step by step and make the answer as concise as possible;
+  the returned answer should be no more than 100 words. Please return the answer as plain text without any special tokens.
+
 judge_prompt_template: >
-  You are provided with a question, a teacher's answer and a student's answer. Given that question, you need to score the how good the student answer is compare to
-  the teacher's answer. If the student's answer is correct based on the teacher's answer, then return YES. If the answer is not faithful, then return NO
-  and explain which part of the student's answer is not faithful in the Reason section.
-  Return the result in json format with the template:
-    {{
-      "Reason": "your reason here.",
-      "Result": "YES or NO."
-    }}
+  You have been provided with a question, a teacher's answer and a student's answer above. Given that question, you need to score how good the student's answer is compared to
+  the teacher's answer. If the student's answer is correct based on the teacher's answer, then return YES, else return NO.
+  Review it carefully to make sure that the keywords and numerical values are exactly the same.
+  Only respond with "YES" or "NO"; do not respond with anything else.

+RAG_prompt_template: >
+  Question: {question}\n Context: {context}\n
+  Answer this question using the information given in the context above. Here are things to pay attention to:
+    - First provide step-by-step reasoning on how to answer the question.
+    - In the reasoning, if you need to copy-paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copied from the context.
+    - End your response with the final answer in the form <ANSWER>: $answer; the answer should be succinct.
+  You MUST begin your final answer with the tag "<ANSWER>:"
 eval_json: "./evalset.json"

-language: "English"
-
 raft_model_name: "raft-8b"

 base_model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
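
The new `RAG_prompt_template` is consumed by `eval_raft.py` (next file) through plain Python string formatting, with retrieved documents wrapped in `<DOCUMENT>` tags for the RAFT model. A minimal sketch of that step, assuming the config is loaded with PyYAML; the variable names, question and context text are made up for illustration:

```python
# Sketch: how eval_raft.py fills RAG_prompt_template (inputs here are made up).
import yaml

with open("eval_config.yaml") as f:
    config = yaml.safe_load(f)

rag_template = config["RAG_prompt_template"]
documents = "<DOCUMENT>Meta Llama 3 is available in 8B and 70B parameter sizes.</DOCUMENT>\n"
question = "What sizes does Meta Llama 3 come in?"
prompt = rag_template.format(context=documents, question=question)
print(prompt)
```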

+ 68 - 51
recipes/use_cases/end2end-recipes/raft/eval_raft.py

@@ -12,59 +12,79 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import DirectoryLoader
-from langchain.chains import RetrievalQA
+from langchain_core.runnables import RunnablePassthrough
+
 from langchain_core.messages import HumanMessage, SystemMessage
 import re
 import string
 from collections import Counter
+from langchain_core.output_parsers import StrOutputParser
+from langchain.prompts.prompt import PromptTemplate

 def generate_answers_model_only(model_name,question_list,api_url="http://localhost:8000/v1",key="EMPTY"):
         # Use langchain to load the documents from data directory
     # Load the RAFT model
+
     llm = VLLMOpenAI(
         openai_api_key=key,
         openai_api_base=api_url,
         model_name=model_name,
-        model_kwargs={"stop": ["."]},
-        temperature=0.0,)
+        temperature=0.0,
+        max_tokens=100
+        )
+    system_prompt = SystemMessage(content=context['eval_prompt_template'])
     generated_answers = []
-    for question in question_list:
-        response = llm.invoke(question)
-        generated_answers.append(response)
+    all_tasks = [[system_prompt, HumanMessage(content=question)] for question in question_list]
+    generated_answers = llm.batch(all_tasks)
     if len(generated_answers) == 0:
         logging.error("No model answers generated. Please check the input context or model configuration in ",model_name)
         return []
-    return generated_answers
-def generate_answers_with_RAG(model_name, data_dir,question_list, api_url="http://localhost:8000/v1",key="EMPTY"):
+    return clean_text_list(generated_answers)
+def format_docs_raft(docs):
+    context = ""
+    for doc in docs:
+        context += "<DOCUMENT>" + str(doc.page_content) + "</DOCUMENT>\n"
+    return context
+def format_docs(docs):
+    return "\n\n".join(doc.page_content for doc in docs)
+def generate_answers_with_RAG(model_name, data_dir,question_list,rag_template,api_url="http://localhost:8000/v1",key="EMPTY"):
     # Use langchain to load the documents from data directory
     loader = DirectoryLoader(data_dir)
     docs = loader.load()
     # Split the document into chunks with a specified chunk size
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
     all_splits = text_splitter.split_documents(docs)

     # Store the document into a vector store with a specific embedding model
-    vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))
+    vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2",model_kwargs={'device': 'cuda'}))
+    retriever = vectorstore.as_retriever(
+        search_kwargs={"k": 5}
+    )
     # Load the RAFT model
     llm = VLLMOpenAI(
         openai_api_key=key,
         openai_api_base=api_url,
         model_name=model_name,
-        model_kwargs={"stop": ["."]},
-        temperature=0.0,)
-    # Create a RetrievalQA chain with the vector store and RAFT model
-    qa_chain = RetrievalQA.from_chain_type(
-    llm,
-    retriever=vectorstore.as_retriever()
-    )
-    generated_answers = []
-    for question in question_list:
-        response = qa_chain({"query": question})
-        generated_answers.append(response['result'])
+        temperature=0.0,
+        max_tokens=100
+        )
+    all_tasks = []
+    for q in question_list:
+        # retrieve the top-k (k=5) documents for this question
+        retrieved_docs = retriever.invoke(q)
+        # format the documents into a string
+        if '8B-Instruct' in model_name:
+            documents = format_docs(retrieved_docs)
+        else:
+            documents = format_docs_raft(retrieved_docs)
+        # create a prompt
+        text = rag_template.format(context=documents,question=q)
+        all_tasks.append(text)
+    generated_answers = llm.batch(all_tasks)
     if len(generated_answers) == 0:
         logging.error("No RAG answers generated. Please check the input context or model configuration in ",model_name)
         return []
-    return generated_answers
+    return clean_text_list(generated_answers)
 def compute_rouge_score(generated : list, reference: list):
     rouge_score = evaluate.load('rouge')
     return rouge_score.compute(
@@ -73,14 +93,19 @@ def compute_rouge_score(generated : list, reference: list):
         use_stemmer=True,
         use_aggregator=True
     )
-def remove_special_tokens(text_list):
-    clean_text_list = []
+def clean_text_list(text_list):
+    result = []
     for text in text_list:
-        text = text.replace("##begin_quote##","")
-        text = text.replace("##end_quote##","")
+        # for the RAFT model, the answer will start with an <ANSWER> tag
+        index = text.rfind("<ANSWER>")
+        if index!= -1:
+            text = text[index:]
+        text = text.replace("begin_quote","")
+        text = text.replace("end_quote","")
+        text = text.replace("##","")
         text = text.strip()
-        clean_text_list.append(text)
-    return clean_text_list
+        result.append(text)
+    return result

 def normalize_answer(s):

@@ -125,25 +150,20 @@ def compute_judge_score(questions: list, generated : list, reference: list, cont
         openai_api_key=key,
         openai_api_base=api_url,
         model_name=model_name,
-        model_kwargs={"stop": ["."]},
-        temperature=0.0,)
+        temperature=0.0)
+    all_tasks = []
     for q,pred,gold in zip(questions, generated,reference):
-        # messages = [
-        #     SystemMessage(content=context['judge_prompt_template']),
-        #     HumanMessage(content=f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "),
-        # ]
-        messages = context['judge_prompt_template'] + "\n"
-        messages += f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "
-        response = llm.invoke(messages)
-        print(response+ " -------------")
-        result = json.loads(response)
-        if "Result" not in result:
-            print("Error: eval response does not contain answer")
-            print(result)
-            continue
-        correct_num += result["Result"] == "YES"
+        messages = [
+            HumanMessage(content=f"Question: {q} \n Teacher's Answer: {gold} \n Student's Answer: {pred} "),
+            SystemMessage(content=context['judge_prompt_template'])
+        ]
+        all_tasks.append(messages)
+    responses = llm.batch(all_tasks)
+    for response in responses:
+        if "YES" in response:
+            correct_num += 1
     return correct_num/len(questions)
-def score_single(context,generated,reference,questions, run_exact_match=True,run_rouge=True, run_bert=True, run_llm_as_judge=True):
+def score_single(context,generated,reference,questions, run_exact_match=True,run_rouge=True, run_bert=True, run_llm_as_judge=False):
     # set metric to default -1, means no metric is computed
     metric = {
         "Rouge_score": -1,
@@ -192,15 +212,12 @@ def main(context):
         }
         # Generate answers for baseline
         base_model_name = context["base_model_name"]
-        generated_answers["Baseline"] = generate_answers_model_only(base_model_name,questions,api_url)
-        #generated_answers["Baseline_RAG"] = generate_answers_with_RAG(base_model_name, context["data_dir"],questions,api_url)
+        #generated_answers["Baseline"] = generate_answers_model_only(base_model_name,questions,api_url)
+        generated_answers["Baseline_RAG"] = generate_answers_with_RAG(base_model_name, context["data_dir"],questions,context['RAG_prompt_template'],api_url)
         # Generate answers for RAFT
         raft_model_name = context["raft_model_name"]
         #generated_answers["RAFT"] = generate_answers_model_only(raft_model_name,questions,api_url)
-        #generated_answers["RAFT_RAG"] = generate_answers_with_RAG(raft_model_name, context["data_dir"],questions,api_url)
-        # clean special tokens from the RAFT generated answer
-        #generated_answers["RAFT"] = remove_special_tokens(generated_answers["RAFT"])
-        #generated_answers["RAFT_RAG"] = remove_special_tokens(generated_answers["RAFT_RAG"])
+        generated_answers["RAFT_RAG"] = generate_answers_with_RAG(raft_model_name, context["data_dir"],questions,context['RAG_prompt_template'],api_url)
         logging.info(f"Successfully generated {len(generated_answers['Baseline_RAG'])} answers for all models.")
         logging.info(f"Successfully generated {len(generated_answers['Baseline_RAG'])} answers for all models.")
         # for generate answer from each model, compute the score metric
         # for generate answer from each model, compute the score metric
         for model_name,model_answer in generated_answers.items():
         for model_name,model_answer in generated_answers.items():
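
To show what the reworked `clean_text_list` does to a RAFT-style completion, here is a small standalone sketch of the same logic; the raw response string is invented for illustration:

```python
# Standalone sketch of the cleaning logic added above; the raw response is made up.
def clean_text(text: str) -> str:
    index = text.rfind("<ANSWER>")   # RAFT answers end with an "<ANSWER>:" tag
    if index != -1:
        text = text[index:]          # drop the reasoning before the tag
    for token in ("begin_quote", "end_quote", "##"):
        text = text.replace(token, "")
    return text.strip()

raw = (
    "The context says ##begin_quote##Meta Llama 3 comes in 8B and 70B##end_quote##. "
    "<ANSWER>: Meta Llama 3 comes in 8B and 70B parameter sizes."
)
print(clean_text(raw))
# -> "<ANSWER>: Meta Llama 3 comes in 8B and 70B parameter sizes."
```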

File diff suppressed because it is too large
+ 57 - 37
recipes/use_cases/end2end-recipes/raft/evalset.json


+ 0 - 1
recipes/use_cases/end2end-recipes/raft/raft.py

@@ -36,7 +36,6 @@ async def main(context):
                 logging.info(f"Question: {question}")
                 logging.info(f"Question: {question}")
         logging.info(f"Successfully generated {sum([len(q) for c,q in chunk_questions_zip])} question/answer pairs.")
         logging.info(f"Successfully generated {sum([len(q) for c,q in chunk_questions_zip])} question/answer pairs.")
         ds = await add_chunk_to_dataset(chunk_questions_zip,context, chat_service,ds,NUM_DISTRACT_DOCS, ORCALE_P)
         ds = await add_chunk_to_dataset(chunk_questions_zip,context, chat_service,ds,NUM_DISTRACT_DOCS, ORCALE_P)
-        print(ds[0])
         ds.save_to_disk(args.output)
         ds.save_to_disk(args.output)
         logging.info(f"Data successfully written to {context['output']}. Process completed.")
         logging.info(f"Data successfully written to {context['output']}. Process completed.")
         formatter = DatasetConverter()
         formatter = DatasetConverter()