Browse Source

added refusal and adjusted prompts

Kai Wu 11 months ago
parent
commit
af53ee051e

File diff suppressed because it is too large
+ 21 - 19
recipes/use_cases/end2end-recipes/raft/README.md


+ 2 - 6
recipes/use_cases/end2end-recipes/raft/raft.py

@@ -1,7 +1,4 @@
 import logging
 import logging
-from typing import Literal, Any
-import json
-import random
 import os
 import os
 import argparse
 import argparse
 from raft_utils import generate_questions, add_chunk_to_dataset
 from raft_utils import generate_questions, add_chunk_to_dataset
@@ -10,8 +7,6 @@ from config import load_config
 
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 
-NUM_DISTRACT_DOCS = 5 # number of distracting documents to add to each chunk
-ORCALE_P = 0.8 # probability of related documents to be added to each chunk
 def main(api_config):
 def main(api_config):
     ds = None
     ds = None
     try:
     try:
@@ -26,7 +21,7 @@ def main(api_config):
             for question in questions:
             for question in questions:
                 logging.info(f"Question: {question}")
                 logging.info(f"Question: {question}")
         logging.info(f"Successfully generated {sum([len(q) for c,q in chunk_questions_zip])} question/answer pairs.")
         logging.info(f"Successfully generated {sum([len(q) for c,q in chunk_questions_zip])} question/answer pairs.")
-        ds = add_chunk_to_dataset(chunk_questions_zip,api_config,ds,NUM_DISTRACT_DOCS, ORCALE_P)
+        ds = add_chunk_to_dataset(chunk_questions_zip,api_config,ds)
         ds.save_to_disk(args.output)
         ds.save_to_disk(args.output)
         logging.info(f"Data successfully written to {api_config['output']}. Process completed.")
         logging.info(f"Data successfully written to {api_config['output']}. Process completed.")
         formatter = DatasetConverter()
         formatter = DatasetConverter()
@@ -92,6 +87,7 @@ if __name__ == "__main__":
         api_config["api_key"] = os.environ["API_KEY"]
         api_config["api_key"] = os.environ["API_KEY"]
     logging.info(f"Configuration loaded. Generating {args.questions_per_chunk} question per chunk using model '{args.model}'.")
     logging.info(f"Configuration loaded. Generating {args.questions_per_chunk} question per chunk using model '{args.model}'.")
     logging.info(f"Chunk size: {args.chunk_size}.")
     logging.info(f"Chunk size: {args.chunk_size}.")
+    logging.info(f"num_distract_docs: {api_config['num_distract_docs']}, oracle_p: {api_config['oracle_p']}")
     logging.info(f"Will use endpoint_url: {args.endpoint_url}.")
     logging.info(f"Will use endpoint_url: {args.endpoint_url}.")
     logging.info(f"Output will be written to {args.output}.")
     logging.info(f"Output will be written to {args.output}.")
     main(api_config)
     main(api_config)

+ 34 - 31
recipes/use_cases/end2end-recipes/raft/raft.yaml

@@ -1,40 +1,43 @@
 COT_prompt_template: >
 COT_prompt_template: >
-  <|begin_of_text|><|start_header_id|>system<|end_header_id|> Answer the following question using the information given in the context below. Here is things to pay attention to:
-    - First provide step-by-step reasoning on how to answer the question.
-    - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.
-    - End your response with final answer in the form <ANSWER>: $answer, the answer should less than 60 words.
-    You MUST begin your final answer with the tag "<ANSWER>: <|eot_id|>
+  <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a helpful chatbot who can provide an answer to every question from the user given a relevant context.<|eot_id|>
   <|start_header_id|>user<|end_header_id|>
   <|start_header_id|>user<|end_header_id|>
-  Question: {question}\nContext: {context}\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-
-# question_prompt_template: >
-#   <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a synthetic question-answer pair generator. Given a chunk of context about
-#   some topic(s), generate {num_questions} example questions a user could ask and would be answered
-#   using information from the chunk. For example, if the given context was a Wikipedia
-#   paragraph about the United States, an example question could be 'How many states are
-#   in the United States?
-#   The questions should be able to be answered in 100 words or less. Include only the
-#   questions in your response.<|eot_id|>
-#   <|start_header_id|>user<|end_header_id|>
-#   Context: {context}\n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
+  Question: {question}\nContext: {context}\n
+  Answer this question using the information given by multiple documents in the context above. Here are things to pay attention to:
+  - The context contains many documents, each document starts with <DOCUMENT> and ends </DOCUMENT>.
+  - First provide step-by-step reasoning on how to answer the question.
+  - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.
+  - End your response with final answer in the form <ANSWER>: $answer, the answer should be less than 60 words.
+  You MUST begin your final answer with the tag "<ANSWER> <|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
 
 question_prompt_template: >
 question_prompt_template: >
-  <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a language model skilled in creating quiz questions.
-  You will be provided with a document,
-  read it and please generate factoid question and answer pairs that are most likely be asked by a user of Llama language models
-  which includes LLama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1,	Meta Llama Guard 2
-  Your factoid questions should be answerable with a specific, concise piece of factual information from the context.
-  Your factoid questions should be formulated in the same style as questions users could ask in a search engine.
-  This means that your factoid questions MUST NOT mention something like "according to the passage" or "context".
-  please make sure you follow those rules:
-  1. Generate {num_questions} question answer pairs, you can generate less answer if there is nothing related to
-  model, training, fine-tuning and evaluation details of Llama language models,
-  2. The questions can be answered based *solely* on the given passage.
-  3. Avoid asking questions with similar meaning.
-  4. Never use any abbreviation.
-  5. The questions should be able to be answered in 60 words or less. Include only the questions in your response. <|eot_id|>
+  <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a synthetic question-answer pair generator. Given a chunk of context about
+  some topic(s), generate {num_questions} example questions a user could ask and would be answered
+  using information from the chunk. For example, if the given context was a Wikipedia
+  paragraph about the United States, an example question could be 'How many states are
+  in the United States?
+  Your questions should be formulated in the same style as questions that users could ask in a search engine.
+  This means that your questions MUST NOT mention something like "according to the passage" or "context".
+  The questions should be able to be answered in 60 words or less. Include only the questions in your response.<|eot_id|>
   <|start_header_id|>user<|end_header_id|>
   <|start_header_id|>user<|end_header_id|>
   Context: {context}\n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
   Context: {context}\n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+# question_prompt_template: >
+#   <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a language model skilled in creating quiz questions.
+#   You will be provided with a document,
+#   read it and please generate factoid question and answer pairs that are most likely be asked by a user of Llama language models
+#   which includes LLama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1,	Meta Llama Guard 2
+#   Your factoid questions should be answerable with a specific, concise piece of factual information from the context.
+#   Your factoid questions should be formulated in the same style as questions users could ask in a search engine.
+#   This means that your factoid questions MUST NOT mention something like "according to the passage" or "context".
+#   please make sure you follow those rules:
+#   1. Generate {num_questions} question answer pairs, you can generate less answer if there is nothing related to
+#   model, training, fine-tuning and evaluation details of Llama language models,
+#   2. The questions can be answered based *solely* on the given passage.
+#   3. Avoid asking questions with similar meaning.
+#   4. Never use any abbreviation.
+#   5. The questions should be able to be answered in 60 words or less. Include only the questions in your response. <|eot_id|>
+#   <|start_header_id|>user<|end_header_id|>
+#   Context: {context}\n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
 data_dir: "./data"
 data_dir: "./data"
 
 
 xml_path: ""
 xml_path: ""

+ 1 - 1
recipes/use_cases/end2end-recipes/raft/raft_eval.py

@@ -8,7 +8,7 @@ import json
 from langchain_openai import ChatOpenAI
 from langchain_openai import ChatOpenAI
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain_community.vectorstores import FAISS
-from langchain.text_splitter import RecursiveCharacterTextSplitter,TokenTextSplitter
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores.utils import DistanceStrategy
 from langchain_community.vectorstores.utils import DistanceStrategy
 from datetime import datetime
 from datetime import datetime
 from langchain_community.document_loaders import DirectoryLoader
 from langchain_community.document_loaders import DirectoryLoader

+ 9 - 8
recipes/use_cases/end2end-recipes/raft/raft_eval_config.yaml

@@ -2,7 +2,7 @@ eval_prompt_template: >
   <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a AI assistant that skilled in answering questions related to Llama language models,
   <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a AI assistant that skilled in answering questions related to Llama language models,
   which includes LLama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1,	Meta Llama Guard 2,
   which includes LLama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1,	Meta Llama Guard 2,
   Below is a question from a llama user, please the answer it with best of your knowledge,
   Below is a question from a llama user, please the answer it with best of your knowledge,
-  The returned answer should be no more than 100 words.Please return the answers in text directly without any special tokens.<|eot_id|>
+  The returned answer should be no more than 60 words. Please return the answers in text directly without any special tokens.<|eot_id|>
   <|start_header_id|>user<|end_header_id|>
   <|start_header_id|>user<|end_header_id|>
   Question:{question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
   Question:{question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
 judge_prompt_template: >
 judge_prompt_template: >
@@ -19,14 +19,15 @@ judge_prompt_template: >
     <|start_header_id|>user<|end_header_id|>
     <|start_header_id|>user<|end_header_id|>
     Question: {question} \n Teacher's Answer: {gold} \n Student's Answer: {prediction} <|eot_id|><|start_header_id|>assistant<|end_header_id|>
     Question: {question} \n Teacher's Answer: {gold} \n Student's Answer: {prediction} <|eot_id|><|start_header_id|>assistant<|end_header_id|>
 RAG_prompt_template: >
 RAG_prompt_template: >
-  <|begin_of_text|><|start_header_id|>system<|end_header_id|> Answer the following question using the information given in the context below. Here is things to pay attention to:
-    1.The context contains many documents, each document starts with <DOCUMENT> and ends </DOCUMENT>.
-    2.First provide step-by-step reasoning on how to answer the question.
-    3.In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.
-    4.End your response with final answer in the form <ANSWER>: $answer, the answer should less than 60 words.
-    You MUST begin your final answer with the tag "<ANSWER>:<|eot_id|>
+  <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a helpful chatbot who can provide an answer to every question from the user given a relevant context.<|eot_id|>
   <|start_header_id|>user<|end_header_id|>
   <|start_header_id|>user<|end_header_id|>
-  Question: {question}\nContext: {context}\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+  Question: {question}\nContext: {context}\n
+  Answer this question using the information given by multiple documents in the context above. Here are things to pay attention to:
+  - The context contains many documents, each document starts with <DOCUMENT> and ends </DOCUMENT>.
+  - First provide step-by-step reasoning on how to answer the question.
+  - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.
+  - End your response with final answer in the form <ANSWER>: $answer, the answer should be less than 60 words.
+  You MUST begin your final answer with the tag "<ANSWER> <|eot_id|><|start_header_id|>assistant<|end_header_id|>
 eval_file: "./eval_llama.json"
 eval_file: "./eval_llama.json"
 
 
 model_name: "raft-8b"
 model_name: "raft-8b"

+ 18 - 9
recipes/use_cases/end2end-recipes/raft/raft_utils.py

@@ -9,7 +9,7 @@ from datasets import Dataset
 import random
 import random
 from langchain_community.document_loaders import SitemapLoader,DirectoryLoader
 from langchain_community.document_loaders import SitemapLoader,DirectoryLoader
 from bs4 import BeautifulSoup
 from bs4 import BeautifulSoup
-
+import copy
 from langchain_openai import ChatOpenAI
 from langchain_openai import ChatOpenAI
 
 
 
 
@@ -171,12 +171,12 @@ def add_chunk_to_dataset(
     chunk_questions_zip: list,
     chunk_questions_zip: list,
     api_config: dict,
     api_config: dict,
     ds,
     ds,
-    num_distract: int = 3,
-    p: float = 0.8,
 ) -> None:
 ) -> None:
     """
     """
     Given a chunk and related questions lists, create {Q, A, D} triplets and add them to the dataset.
     Given a chunk and related questions lists, create {Q, A, D} triplets and add them to the dataset.
     """
     """
+    num_distract = api_config["num_distract_docs"]
+    p = api_config["oracle_p"]
     chunks = [chunk for chunk, _ in chunk_questions_zip]
     chunks = [chunk for chunk, _ in chunk_questions_zip]
     COT_results = generate_COT(chunk_questions_zip,api_config)
     COT_results = generate_COT(chunk_questions_zip,api_config)
     for chunk, q , cot in COT_results:
     for chunk, q , cot in COT_results:
@@ -198,12 +198,8 @@ def add_chunk_to_dataset(
         indices.remove(i)
         indices.remove(i)
         for j in random.sample(indices, num_distract):
         for j in random.sample(indices, num_distract):
             docs.append(chunks[j])
             docs.append(chunks[j])
-        # decides whether to add oracle document
-        oracle = random.uniform(0, 1) < p
-        if not oracle:
-            docs[0] = chunks[random.sample(indices, 1)[0]]
+        doc_copy = docs.copy()
         random.shuffle(docs)
         random.shuffle(docs)
-
         d = {
         d = {
             "title": [],
             "title": [],
             "sentences": []
             "sentences": []
@@ -221,7 +217,7 @@ def add_chunk_to_dataset(
         context += q
         context += q
         # This instruction will be used in the fine-tuning stage
         # This instruction will be used in the fine-tuning stage
         datapt["instruction"] = context
         datapt["instruction"] = context
-
+        datapt_copy = copy.deepcopy(datapt)
         # add to dataset
         # add to dataset
         if not ds:
         if not ds:
             # init ds
             # init ds
@@ -235,4 +231,17 @@ def add_chunk_to_dataset(
             ds = Dataset.from_dict(datapt)
             ds = Dataset.from_dict(datapt)
         else:
         else:
             ds = ds.add_item(datapt)
             ds = ds.add_item(datapt)
+        # decides whether to add refusal example where the related documents are not provided
+        oracle = random.uniform(0, 1) < p
+        if not oracle:
+            doc_copy[0] = chunks[random.sample(indices, 1)[0]]
+            random.shuffle(doc_copy)
+            context = ""
+            for doc in doc_copy:
+                context += "<DOCUMENT>" + str(doc) + "</DOCUMENT>\n"
+            context += q
+            # This instruction will be used in the fine-tuning stage
+            datapt_copy["instruction"] = context
+            datapt_copy["cot_answer"] = "Sorry, I don't know the answer to this question because related documents are not found. Please try again."
+            ds = ds.add_item(datapt_copy)
     return ds
     return ds