@@ -3,6 +3,7 @@
 import os
 import re
+import string
 from transformers import AutoTokenizer
 import asyncio
 import magic
@@ -64,7 +65,9 @@ def process_file(file_path):
    else:
        logging.warning(f"Unsupported file type {file_type} for file {file_path}")
        return ''
-
+def remove_non_printable(s):
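+    # string.printable covers ASCII letters, digits, punctuation and whitespace, so
+    # this drops control bytes and non-ASCII characters left over from file parsing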
+    printable = set(string.printable)
+    return ''.join(filter(lambda x: x in printable, s))
 def read_file_content(context):
     file_strings = []

@@ -74,10 +77,8 @@ def read_file_content(context):
        file_text = process_file(file_path)
        if file_text:
            file_strings.append(file_text)
-    text = ' '.join(file_strings)
-    if len(text) == 0:
-        logging.error(f"Error reading files, text is empty")
-    return ' '.join(file_strings)
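+    # join the per-file strings with newlines, then strip non-printable characters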
+    text = '\n'.join(file_strings)
+    return remove_non_printable(text)
 # clean the text by removing all parts that do not contain any alphanumeric characters
 def clean(s):
     result = []
@@ -86,6 +87,42 @@ def clean(s):
            result.append(item)
    return " ".join(result)
 # given a response string, return a string that can be saved as json.
+def parse_qac_to_json(response_string):
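+    # scan for the '"Question":', '"Answer":' and '"Context":' labels; start, mid and
+    # end hold the line indices of the triple currently being collected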
+    split_lines = response_string.split("\n")
+    start,mid,end = None,None,None
+    # must use a set to avoid duplicate question/answer pairs caused by concurrent async calls
+    qa_set = set()
+    for i in range(len(split_lines)):
+        line = split_lines[i]
+        # still looking for the first "Question"
+        if start is None:
+            # once found, record its line number
+            if '"Question":' in line:
+                start = i
+        else:
+            # "Question" already found: "Answer" marks mid, "Context" marks end
+            if '"Answer":' in line:
+                mid = i
+            elif '"Context":' in line:
+                end = i
+            # a new "Question" means the previous triple is complete, so record it
+            elif '"Question":' in line:
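+                # rejoin each field's lines and strip its label to recover the raw text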
+                question = " ".join(split_lines[start:mid]).split('"Question":')[1]
+                answer = " ".join(split_lines[mid:end]).split('"Answer":')[1]
+                context = " ".join(split_lines[end:i]).split('"Context":')[1]
+                start,mid,end = i,None,None
+                qa_set.add((clean(question), clean(answer), clean(context)))
+    # add the final triple once the loop ends
+    if start is not None and mid is not None and end is not None:
+        question = " ".join(split_lines[start:mid]).split('"Question":')[1]
+        answer = " ".join(split_lines[mid:end]).split('"Answer":')[1]
+        context = " ".join(split_lines[end:]).split('"Context":')[1]
+        qa_set.add((clean(question), clean(answer), clean(context)))
+    qa_list = [{"Question": q, "Answer": a, "Context": c} for q, a, c in qa_set]
+
+    return json.dumps(qa_list, indent=4)
+
 def parse_qa_to_json(response_string):
     split_lines = response_string.split("\n")
     start,end = None,None
@@ -115,29 +152,32 @@ def parse_qa_to_json(response_string):
        qa_set.add((clean(question), clean(answer)))
    qa_list = [{"Question": q, "Answer":a} for q,a in qa_set]

-    return json.dumps(qa_list, indent=4)
-
+    return qa_list

 async def prepare_and_send_request(chat_service, api_context: dict, document_content: str, num_questions: int) -> dict:
     prompt_for_system = api_context['question_prompt_template'].format(num_questions=num_questions, language=api_context["language"])
     chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': document_content}]
-    result = await chat_service.execute_chat_request_async(api_context, chat_request_payload,eval=False)
+    result = await chat_service.execute_chat_request_async(api_context, chat_request_payload)
+    # parse the model output into Question/Answer/Context records (returned as a JSON string)
+    result = parse_qac_to_json(result)
     if not result:
         return {}
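+    # parse_qac_to_json returns a JSON string, so load it back into a list of dicts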
-    return json.loads(await chat_service.execute_chat_request_async(api_context, chat_request_payload,eval=False))
+    return json.loads(result)
 # This function is used to evaluate the quality of generated QA pairs. Return the original QA pair if the model eval result is YES. Otherwise, return an empty dict.
-async def data_eval_request(chat_service, api_context: dict, document_content: dict) -> dict:
+async def data_curation_request(chat_service, api_context: dict, document_content: dict) -> dict:
     prompt_for_system = api_context['curation_prompt_template'].format(language=api_context["language"])
-    chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': f"Question: {document_content['Question']}, Answer: {document_content['Answer']}"}]
-    result = await chat_service.execute_chat_request_async(api_context, chat_request_payload,eval=True)
+    chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': f"Question: {document_content['Question']} \n Answer: {document_content['Answer']}\n Context: {document_content['Context']} "}]
+    result = await chat_service.execute_chat_request_async(api_context, chat_request_payload)
     if not result:
         return {}
-    if "Answer" not in result:
+    # no parsing needed, just load the result string into a dict
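+    # expected response shape (inferred from the checks below): {"Result": "YES"} or {"Result": "NO"}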
+    result = json.loads(result)
+    if "Result" not in result:
-        print("Error: eval response does not contain answer")
+        print("Error: eval response does not contain a Result field")
         print(document_content,result)
         return {}
     # Send back the original QA pair if the model eval result is YES
-    if result["Answer"] == "YES":
+    if result["Result"] == "YES":
         return document_content
     else:
         print(document_content,result)
@@ -177,11 +217,11 @@ async def generate_question_batches(chat_service, api_context: dict):

     return question_generation_results

-async def generate_data_eval(chat_service, api_context: dict, generated_questions: list):
+async def generate_data_curation(chat_service, api_context: dict, generated_questions: list):
     eval_tasks = []
     for batch_index, batch_content in enumerate(generated_questions):
         try:
-            result = data_eval_request(chat_service, api_context, batch_content)
+            result = data_curation_request(chat_service, api_context, batch_content)
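+            # note: the coroutine is appended without awaiting; the collected tasks
+            # are presumably gathered and awaited by the caller (e.g., via asyncio.gather)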
             eval_tasks.append(result)
         except Exception as e:
             print(f"Error during data eval request execution: {e}")