Browse Source

changed raft_dataset.py

Kai Wu 1 năm trước
mục cha
commit
890d49d45b

+ 11 - 8
recipes/finetuning/datasets/raft_dataset.py

@@ -64,21 +64,24 @@ def tokenize_dialog(dialog, tokenizer):
 
     return dict(combined_tokens, attention_mask=[1]*len(combined_tokens["input_ids"]))
 def raft_tokenize(q_a_pair, tokenizer):
-    # last line is the question
-    question = q_a_pair["instruction"].split('\n')[-1]
-    # all the lines before the last line are the context
-    documents = q_a_pair["instruction"].split('\n')[:-1]
+    end_tag = "<\/DOCUMENT>\n"
+    # find the last end_tag in the instruction, the rest is the question
+    index =q_a_pair["instruction"].rindex("<\/DOCUMENT>\n")+len(end_tag)
+    question = q_a_pair["instruction"][index:]
+    # all the lines before end_tag are the context
+    documents = q_a_pair["instruction"][:index]
     # output is the label
     answer = q_a_pair["output"]
     system_prompt = "You are a helpful chatbot who can provide an answer to every questions from the user given a relevant context."
     user_prompt = """
         Question: {question}\nContext: {context}\n
-        Answer this question using the information given multiple documents in the context above. Here is things to pay attention to:
+        Answer this question using the information given by multiple documents in the context above. Here are things to pay attention to:
+        - The context contains many documents, each document starts with <DOCUMENT> and ends </DOCUMENT>.
         - First provide step-by-step reasoning on how to answer the question.
         - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.
-        - End your response with final answer in the form <ANSWER>: $answer, the answer should be succinct.
-        You MUST begin your final answer with the tag "<ANSWER>:".
-    """.format(question=question, context=str(documents))
+        - End your response with final answer in the form <ANSWER>: $answer, the answer should less than 60 words.
+        You MUST begin your final answer with the tag "<ANSWER>
+    """.format(question=question, context=documents)
 
     chat = [
     {"role": "system", "content": system_prompt},

Những thay đổi đã bị hủy bỏ vì nó quá lớn
+ 103 - 0
recipes/use_cases/end2end-recipes/raft/data/llama_website0613


+ 1 - 1
recipes/use_cases/end2end-recipes/raft/raft.yaml

@@ -2,7 +2,7 @@ COT_prompt_template: >
   <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a helpful chatbot who can provide an answer to every questions from the user given a relevant context.<|eot_id|>
   <|start_header_id|>user<|end_header_id|>
   Question: {question}\nContext: {context}\n
-  Answer this question using the information given by multiple documents in the context above. Here is things to pay attention to:
+  Answer this question using the information given by multiple documents in the context above. Here are things to pay attention to:
   - The context contains many documents, each document starts with <DOCUMENT> and ends </DOCUMENT>.
   - First provide step-by-step reasoning on how to answer the question.
   - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.

+ 1 - 1
recipes/use_cases/end2end-recipes/raft/raft_eval_config.yaml

@@ -22,7 +22,7 @@ RAG_prompt_template: >
   <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a helpful chatbot who can provide an answer to every questions from the user given a relevant context.<|eot_id|>
   <|start_header_id|>user<|end_header_id|>
   Question: {question}\nContext: {context}\n
-  Answer this question using the information given by multiple documents in the context above. Here is things to pay attention to:
+  Answer this question using the information given by multiple documents in the context above. Here are things to pay attention to:
   - The context contains many documents, each document starts with <DOCUMENT> and ends </DOCUMENT>.
   - First provide step-by-step reasoning on how to answer the question.
   - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.