
end-to-end testing on the pipeline

Kai Wu 1 year ago
parent
commit
bb96a887c9

File diff suppressed because it is too large
+ 17 - 10
recipes/use_cases/end2end-recipes/chatbot/pipelines/README.md


+ 24 - 35
recipes/use_cases/end2end-recipes/chatbot/pipelines/chat_utils.py

@@ -5,7 +5,6 @@ from octoai.client import OctoAI
 from functools import partial
 from openai import OpenAI
 import json
-from generator_utils import generate_question_batches, parse_qa_to_json, generate_data_eval
 # Configure logging to include the timestamp, log level, and message
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 # Since OctoAI has different naming for llama models, create this mapping to get the official Hugging Face model name given an OctoAI name.
@@ -18,14 +17,14 @@ allowed_concurrent_requests = int(rate_limit_threshold * 0.75)
 request_limiter = asyncio.Semaphore(allowed_concurrent_requests)
 class ChatService(ABC):
     @abstractmethod
-    async def execute_chat_request_async(self, api_context: dict, chat_request, eval=False):
+    async def execute_chat_request_async(self, api_context: dict, chat_request):
         pass
 
 # Please implement your own chat service class here.
 # The class should inherit from the ChatService class and implement the execute_chat_request_async method.
 # The following are two example chat service classes that you can use as a reference.
 class OctoAIChatService(ChatService):
-    async def execute_chat_request_async(self, api_context: dict, chat_request, eval=False):
+    async def execute_chat_request_async(self, api_context: dict, chat_request):
         async with request_limiter:
             try:
                 event_loop = asyncio.get_running_loop()
@@ -38,41 +37,31 @@ class OctoAIChatService(ChatService):
                 )
                 response = await event_loop.run_in_executor(None, api_chat_call)
                 assistant_response = next((choice.message.content for choice in response.choices if choice.message.role == 'assistant'), "")
-                if eval:
-                    assistant_response_json = json.loads(assistant_response)
-                else:
-                    assistant_response_json = parse_qa_to_json(assistant_response)
-
-                return assistant_response_json
+                return assistant_response
             except Exception as error:
                 logging.error(f"Error during chat request execution: {error}",exc_info=True)
                 return ""
 # Use the local vllm openai compatible server for generating question/answer pairs to make API call syntax consistent
 # please read for more detail: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html
 class VllmChatService(ChatService):
-    async def execute_chat_request_async(self, api_context: dict, chat_request, eval=False):
-        async with request_limiter:
-            try:
-                event_loop = asyncio.get_running_loop()
-                if api_context["model"] in MODEL_NAME_MAPPING:
-                    model_name = MODEL_NAME_MAPPING[api_context['model']]
-                else:
-                    model_name = api_context['model']
-                client = OpenAI(api_key=api_context['api_key'], base_url="http://localhost:"+ str(api_context['endpoint'])+"/v1")
-                api_chat_call = partial(
-                    client.chat.completions.create,
-                    model=model_name,
-                    messages=chat_request,
-                    temperature=0.0
-                )
-                response = await event_loop.run_in_executor(None, api_chat_call)
-                assistant_response = next((choice.message.content for choice in response.choices if choice.message.role == 'assistant'), "")
-                if eval:
-                    print(assistant_response)
-                    assistant_response_json = json.loads(assistant_response)
-                else:
-                    assistant_response_json = parse_qa_to_json(assistant_response)
-                return assistant_response_json
-            except Exception as error:
-                logging.error(f"Error during chat request execution: {error}",exc_info=True)
-                return ""
+    async def execute_chat_request_async(self, api_context: dict, chat_request):
+        try:
+            event_loop = asyncio.get_running_loop()
+            if api_context["model"] in MODEL_NAME_MAPPING:
+                model_name = MODEL_NAME_MAPPING[api_context['model']]
+            else:
+                model_name = api_context['model']
+            client = OpenAI(api_key=api_context['api_key'], base_url="http://localhost:"+ str(api_context['endpoint'])+"/v1")
+            api_chat_call = partial(
+                client.chat.completions.create,
+                model=model_name,
+                messages=chat_request,
+                temperature=0.0
+            )
+            response = await event_loop.run_in_executor(None, api_chat_call)
+            assistant_response = next((choice.message.content for choice in response.choices if choice.message.role == 'assistant'), "")
+            print("assistant_response",assistant_response)
+            return assistant_response
+        except Exception as error:
+            logging.error(f"Error during chat request execution: {error}",exc_info=True)
+            return ""
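
The chat service classes now hand back the raw assistant string instead of pre-parsed JSON, so parsing moves to the callers. Below is a minimal sketch of the new calling pattern; it assumes a local vllm OpenAI-compatible server is reachable on the port given in api_context, that chat_utils and generator_utils from this recipe are importable, and that the model name and api_key values are placeholders.

import asyncio
from chat_utils import VllmChatService
from generator_utils import parse_qa_to_json

async def demo():
    # Placeholder values; adjust to your local vllm setup.
    api_context = {
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
        "api_key": "EMPTY",
        "endpoint": 8000,
    }
    chat_request = [
        {"role": "system", "content": "Answer in one sentence."},
        {"role": "user", "content": "Question: What is Llama 3?"},
    ]
    service = VllmChatService()
    # The service now returns the raw assistant string; parsing is the caller's job.
    raw_response = await service.execute_chat_request_async(api_context, chat_request)
    qa_list = parse_qa_to_json(raw_response)  # list of {"Question": ..., "Answer": ...} dicts
    print(qa_list)

asyncio.run(demo())

OctoAIChatService implements the same abstract method, so callers can swap services without touching their parsing code.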

+ 1 - 1
recipes/use_cases/end2end-recipes/chatbot/pipelines/doc_processor.py

@@ -1,5 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
 
 # Assuming result_average_token is a constant, use UPPER_CASE for its name to follow Python conventions
 AVERAGE_TOKENS_PER_RESULT = 100

+ 9 - 9
recipes/use_cases/end2end-recipes/chatbot/pipelines/eval_chatbot.py

@@ -8,6 +8,7 @@ from config import load_config
 import asyncio
 import json
 from itertools import chain
+from generator_utils import parse_qa_to_json
 
 def compute_rouge_score(generated : str, reference: str):
     rouge_score = evaluate.load('rouge')
@@ -24,24 +25,23 @@ def compute_bert_score(generated : str, reference: str):
         references=reference,
         lang="en"
     )
-# This function is used to evaluate the quality of generated QA pairs. Return the original QA pair if the model eval result is YES. Otherwise, return an empty dict.
+# This function is used to evaluate the fine-tuned model: given a question, generate an answer.
 async def eval_request(chat_service, api_context: dict, question: str) -> dict:
     prompt_for_system = api_context['eval_prompt_template'].format(language=api_context["language"])
     chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': f"Question: {question}"}]
     # Get the raw model response; after parsing there should be exactly one result
-    results = await chat_service.execute_chat_request_async(api_context, chat_request_payload,eval=False)
-    # convert the result string to a list
-    results = eval(results)
-    if not results or len(results) > 1:
-        print("results",type(results),len(results),results)
+    response_string = await chat_service.execute_chat_request_async(api_context, chat_request_payload)
+    # parse the response string into a list of dicts, each containing Question and Answer
+    result_list = parse_qa_to_json(response_string)
+    if not result_list or len(result_list) > 1:
+        print("Error: eval response should be a list of one result dict")
         return {}
-    result = results[0]
+    result = result_list[0]
     if "Answer" not in result:
         print("Error: eval response does not contain answer")
-        print(question,result)
         return {}
-    print("result",result)
     # Send back the model generated answer
+
     return result["Answer"]
 
 async def generate_eval_answer(chat_service, api_context: dict, questions: list):
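
With parsing moved out of the chat service, eval_request now expects the model to return a single-element JSON list matching eval_config.yaml and validates it itself. A rough, self-contained illustration of that validation path follows; the response text is invented and generator_utils is assumed to be importable from the recipe's pipelines directory.

from generator_utils import parse_qa_to_json

# Invented model response following the eval_prompt_template format.
response_string = """
[
  {
    "Question": "What is the context length of Llama 3?",
    "Answer": "Llama 3 supports a context length of 8K tokens."
  }
]
"""
result_list = parse_qa_to_json(response_string)
# eval_request keeps only well-formed, single-result responses with an Answer field.
if result_list and len(result_list) == 1 and "Answer" in result_list[0]:
    model_answer = result_list[0]["Answer"]
    print(model_answer)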

+ 3 - 2
recipes/use_cases/end2end-recipes/chatbot/pipelines/eval_config.yaml

@@ -2,11 +2,12 @@ eval_prompt_template: >
  You are an AI assistant skilled in answering questions related to Llama models.
  Below is a question from a Llama user. Please answer it in {language} and make the answer as concise as possible; it should be at most 100 words.
   Return the result with the template:
-  {{
+  [
+    {{
       "Question": "The question user asked to you"
       "Answer": "Your answer to the question"
   }}
-
+  ]
 eval_json: "./evalset.json"
 
 language: "English"

+ 18 - 15
recipes/use_cases/end2end-recipes/chatbot/pipelines/generate_question_answers.py

@@ -5,7 +5,7 @@ import argparse
 import asyncio
 import json
 from config import load_config
-from generator_utils import generate_question_batches, generate_data_eval
+from generator_utils import generate_question_batches, generate_data_curation
 from chat_utils import OctoAIChatService, VllmChatService
 from itertools import chain
 import logging
@@ -27,20 +27,15 @@ async def main(context):
         if not data:
             logging.warning("No data generated. Please check the input context or model configuration.")
             return
-        flattened_list = list(chain.from_iterable(data))
-        # with open("data.json") as fp:
-        #     flattened_list = json.load(fp)
-        logging.info(f"Successfully generated {len(flattened_list)} question/answer pairs.")
-        # Use asynchronous file operation for writing to the file
-
-        # async with aiofiles.open("data.json", "w") as output_file:
-        #     await output_file.write(json.dumps(flattened_list, indent=4))
-        # logging.info("Data successfully written to 'data.json'. Process completed.")
-        curated_data = await generate_data_eval(chat_service, context,flattened_list)
-        logging.info(f"Only {len(curated_data)} question/answer pairs pass the self-curation")
-        async with aiofiles.open("curated_data.json", "w") as curated_data:
-             await curated_data.write(json.dumps(flattened_list, indent=4))
-        logging.info("Data successfully written to 'curated_data.json'. Process completed.")
+        data = list(chain.from_iterable(data))
+        logging.info(f"Successfully generated {len(data)} question/answer pairs.")
+        if context["use_curation"]:
+            logging.info("Starting self-curation using the LLM.")
+            data = await generate_data_curation(chat_service, context, data)
+            logging.info(f"Only {len(data)} question/answer pairs passed the self-curation.")
+        async with aiofiles.open(context['output_path'], "w") as output_file:
+             await output_file.write(json.dumps(data, indent=4))
+        logging.info(f"Data successfully written to {context['output_path']}. Process completed.")
     except Exception as e:
         logging.error(f"An unexpected error occurred during the process: {e}",exc_info=True)
 
@@ -72,6 +67,11 @@ def parse_arguments():
         type=int,
         help="If a port is specified, then use local vllm endpoint for generating question/answer pairs."
     )
+    parser.add_argument(
+        "-o", "--output_path",
+        default="./data.json",
+        help="Set the output path for the generated QA pairs. Default is ./data.json."
+    )
     return parser.parse_args()
 
 if __name__ == "__main__":
@@ -82,6 +82,9 @@ if __name__ == "__main__":
     context["total_questions"] = args.total_questions
     context["model"] = args.model
     context["endpoint"] = args.vllm_endpoint
+    # If curation prompt is not empty, then use self-curation
+    context["use_curation"] = len(context["curation_prompt_template"]) > 0
+    context["output_path"] = args.output_path
     logging.info(f"Configuration loaded. Generating {args.total_questions} question/answer pairs using model '{args.model}'.")
     if context["endpoint"]:
         logging.info(f"Use local vllm service at port: '{args.vllm_endpoint}'.")

+ 25 - 15
recipes/use_cases/end2end-recipes/chatbot/pipelines/generation_config.yaml

@@ -1,36 +1,46 @@
 question_prompt_template: >
   You are a language model skilled in creating quiz questions.
   You will be provided with a document,
-  read it and please generate question and answer that are most likely be asked by a user of llama model,
-  please make sure you follow those rules,
-  1. Generate at most {num_questions} question answer pairs, you can generate less questions if you believe there are nothing related to Llama.
-  2. Generate in {language}.
-  3. The questions can be answered based *solely* on the given passage.
-  4. Avoid asking questions with similar meaning.
-  5. Make the answer as concise as possible, it should be at most 60 words.
-  6. Provide relevant links from the document to support the answer.
-  7. Never use any abbreviation.
-  8. Return the result in json format with the template:
+  read it and please generate question and answer pairs that are most likely to be asked by a user of Llama language models,
+  which include Llama, Llama 2, Meta Llama 3, Code Llama, Meta Llama Guard 1 and Meta Llama Guard 2,
+  then extract the context that is related to the question and answer, preferably using sentences from the original text.
+  Please make sure you follow these rules:
+  1. Generate at most {num_questions} question and answer pairs; you can generate fewer questions if you believe there is nothing related to Llama language models.
+  2. For each question and answer pair, add the context that is related to the question and answer, preferably using sentences from the original text.
+  3. Generate in {language}.
+  4. The questions can be answered based *solely* on the given passage.
+  5. Avoid asking questions with similar meaning.
+  6. Make the answer as concise as possible; it should be at most 80 words.
+  7. Provide relevant links from the document to support the answer.
+  8. Never use any abbreviation.
+  9. Return the result in json format with the template:
     [
       {{
         "Question": "your question A.",
         "Answer": "your answer to question A."
+        "Context": "the context for question A"
       }},
       {{
         "Question": "your question B.",
         "Answer": "your answer to question B."
+        "Context": "the context for question B"
       }}
     ]
 
 curation_prompt_template: >
-  Below is a question and answer pair about Llama language model. Evaluate
-  whether or not this qusestion and answer pair will be helpful for a user of Llama langauge model.
-  Respond with only a single JSON blob with an "explanation" field that is a short (less than 100 word)
-  explanation of your answer and an "answer" field which is YES or NO. Only generate the answer in {language}.
+  Below is a question and answer pair (QA pair) and its related context about Llama language models,
+  which include Llama, Llama 2, Meta Llama 3, Code Llama, Meta Llama Guard 1 and Meta Llama Guard 2.
+  Given the context, evaluate whether or not this question and answer pair will be helpful for a user of Llama language models,
+  and whether this question and answer pair is relevant to the context.
+  Note that the answer in the QA pair can be the same as or similar to the context, as repetition of the context is allowed.
+  Respond with only a single JSON blob with a "Reason" field that is a short (less than 100 words)
+  explanation of your answer and a "Result" field which is YES or NO.
+  Only answer "YES" if the question and answer pair is based on the context and provides relevant information about Llama language models.
+  Only generate the answer in {language}.
   Return the result in json format with the template:
     {{
       "Reason": "your reason here.",
-      "Answer": "YES or No."
+      "Result": "YES or No."
     }},
 
 data_dir: "./data"

+ 55 - 15
recipes/use_cases/end2end-recipes/chatbot/pipelines/generator_utils.py

@@ -3,6 +3,7 @@
 
 import os
 import re
+import string
 from transformers import  AutoTokenizer
 import asyncio
 import magic
@@ -64,7 +65,9 @@ def process_file(file_path):
     else:
         logging.warning(f"Unsupported file type {file_type} for file {file_path}")
         return ''
-
+def remove_non_printable(s):
+    printable = set(string.printable)
+    return ''.join(filter(lambda x: x in printable, s))
 def read_file_content(context):
     file_strings = []
 
@@ -74,10 +77,8 @@ def read_file_content(context):
             file_text = process_file(file_path)
             if file_text:
                 file_strings.append(file_text)
-    text = ' '.join(file_strings)
-    if len(text) == 0:
-        logging.error(f"Error reading files, text is empty")
-    return ' '.join(file_strings)
+    text = '\n'.join(file_strings)
+    return remove_non_printable(text)
 # clean the text by removing all parts that did not contain any alphanumeric characters
 def clean(s):
         result = []
@@ -86,6 +87,42 @@ def clean(s):
                 result.append(item)
         return " ".join(result)
 # given a response string, return a string that can be saved as json.
+def parse_qac_to_json(response_string):
+    split_lines = response_string.split("\n")
+    start,mid,end = None,None,None
+    # must use set to avoid duplicate question/answer pairs due to async function calls
+    qa_set = set()
+    for i in range(len(split_lines)):
+        line = split_lines[i]
+        # starting to find "Question"
+        if start is None:
+            # Once found, set start to this line number
+            if '"Question":' in line:
+                start = i
+        else:
+            # "Question" has been found, find "Answer", once found, set end to this line number
+            if '"Answer":' in line:
+                mid = i
+            elif '"Context":' in line:
+                end = i
+            # finding the next "Question" means the previous QA pair is complete, so add it to the set
+            elif '"Question":' in line:
+                question = " ".join(split_lines[start:mid]).split('"Question":')[1]
+                answer = " ".join(split_lines[mid:end]).split('"Answer":')[1]
+                context = " ".join(split_lines[end:i]).split('"Context":')[1]
+                start,mid,end = i,None,None
+                qa_set.add((clean(question), clean(answer),clean(context)))
+    # add the last QA pair after the loop ends
+    if start is not None and mid is not None and end is not None:
+        question = " ".join(split_lines[start:mid]).split('"Question":')[1]
+        answer = " ".join(split_lines[mid:end]).split('"Answer":')[1]
+        context = " ".join(split_lines[end:]).split('"Context":')[1]
+        start,mid,end = i,None,None
+        qa_set.add((clean(question), clean(answer),clean(context)))
+    qa_list = [{"Question": q, "Answer":a, "Context":c} for q,a,c in qa_set]
+
+    return json.dumps(qa_list, indent=4)
+
 def parse_qa_to_json(response_string):
     split_lines = response_string.split("\n")
     start,end = None,None
@@ -115,29 +152,32 @@ def parse_qa_to_json(response_string):
         qa_set.add((clean(question), clean(answer)))
     qa_list = [{"Question": q, "Answer":a} for q,a in qa_set]
 
-    return json.dumps(qa_list, indent=4)
-
+    return qa_list
 
 async def prepare_and_send_request(chat_service, api_context: dict, document_content: str, num_questions: int) -> dict:
     prompt_for_system = api_context['question_prompt_template'].format(num_questions=num_questions, language=api_context["language"])
     chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': document_content}]
-    result = await chat_service.execute_chat_request_async(api_context, chat_request_payload,eval=False)
+    result = await chat_service.execute_chat_request_async(api_context, chat_request_payload)
+    # parse the result string into a JSON string of QA dicts with Question, Answer, Context
+    result = parse_qac_to_json(result)
     if not result:
         return {}
     return json.loads(result)
 # This function is used to evaluate the quality of generated QA pairs. Return the original QA pair if the model eval result is YES. Otherwise, return an empty dict.
-async def data_eval_request(chat_service, api_context: dict, document_content: dict) -> dict:
+async def data_curation_request(chat_service, api_context: dict, document_content: dict) -> dict:
     prompt_for_system = api_context['curation_prompt_template'].format(language=api_context["language"])
-    chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': f"Question: {document_content['Question']}, Answer: {document_content['Answer']}"}]
-    result = await chat_service.execute_chat_request_async(api_context, chat_request_payload,eval=True)
+    chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': f"Question: {document_content['Question']} \n Answer: {document_content['Answer']}\n Context: {document_content['Context']} "}]
+    result = await chat_service.execute_chat_request_async(api_context, chat_request_payload)
     if not result:
         return {}
-    if "Answer" not in result:
+    # no manual parsing needed; just json.loads the result into a dict
+    result = json.loads(result)
+    if "Result" not in result:
         print("Error: eval response does not contain answer")
         print(document_content,result)
         return {}
     # Send back the original QA pair if the model eval result is YES
-    if result["Answer"] == "YES":
+    if result["Result"] == "YES":
         return document_content
     else:
         print(document_content,result)
@@ -177,11 +217,11 @@ async def generate_question_batches(chat_service, api_context: dict):
 
     return question_generation_results
 
-async def generate_data_eval(chat_service, api_context: dict, generated_questions: list):
+async def generate_data_curation(chat_service, api_context: dict, generated_questions: list):
     eval_tasks = []
     for batch_index, batch_content in enumerate(generated_questions):
         try:
-            result = data_eval_request(chat_service, api_context, batch_content)
+            result = data_curation_request(chat_service, api_context, batch_content)
             eval_tasks.append(result)
         except Exception as e:
             print(f"Error during data eval request execution: {e}")