
draft: get answer from a chunk working

Kai Wu 1 year ago
parent
commit
d097c9f52e

The file diff is too large to be shown in full
+ 122 - 0
recipes/use_cases/end2end-recipes/raft/README.md


+ 80 - 0
recipes/use_cases/end2end-recipes/raft/chat_utils.py

@@ -0,0 +1,80 @@
+import asyncio
+import logging
+from abc import ABC, abstractmethod
+from octoai.client import OctoAI
+from functools import partial
+from openai import OpenAI
+import json
+# Configure logging to include the timestamp, log level, and message
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# Since OctoAI uses different names for the Llama models, create this mapping to get the official Hugging Face model name given an OctoAI name.
+MODEL_NAME_MAPPING = {
+    "meta-llama-3-70b-instruct": "meta-llama/Meta-Llama-3-70B-Instruct",
+    "meta-llama-3-8b-instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "llama-2-7b-chat": "meta-llama/Llama-2-7b-chat-hf",
+    "llama-2-70b-chat": "meta-llama/Llama-2-70b-chat-hf",
+}
+# Manage rate limits with throttling
+rate_limit_threshold = 2000
+allowed_concurrent_requests = int(rate_limit_threshold * 0.75)
+request_limiter = asyncio.Semaphore(allowed_concurrent_requests)
+class ChatService(ABC):
+    @abstractmethod
+    async def execute_chat_request_async(self, api_context: dict, chat_request):
+        pass
+def strip_str(s: str) -> str:
+    """
+    Helper for formatting strings returned by the model: keeps the span from the first
+    alphabetic character up to just past the last one, trimming surrounding junk.
+    """
+    l, r = 0, len(s)-1
+    beg_found = False
+    for i in range(len(s)):
+        if s[i].isalpha():
+            if not beg_found:
+                l = i
+                beg_found = True
+            else:
+                r = i
+    r += 2
+    return s[l:min(r, len(s))]
+# Please implement your own chat service class here.
+# The class should inherit from the ChatService class and implement the execute_chat_request_async method.
+# The following are two example chat service classes that you can use as a reference.
+class OctoAIChatService(ChatService):
+    async def execute_chat_request_async(self, api_context: dict, chat_request):
+        async with request_limiter:
+            try:
+                event_loop = asyncio.get_running_loop()
+                client = OctoAI(api_context['api_key'])
+                api_chat_call = partial(
+                    client.chat.completions.create,
+                    model=api_context['model'],
+                    messages=chat_request,
+                    temperature=0.0
+                )
+                response = await event_loop.run_in_executor(None, api_chat_call)
+                assistant_response = next((choice.message.content for choice in response.choices if choice.message.role == 'assistant'), "")
+                return assistant_response
+            except Exception as error:
+                logging.error(f"Error during chat request execution: {error}",exc_info=True)
+                return ""
+# Use the local vLLM OpenAI-compatible server for generating question/answer pairs, so the API call syntax stays consistent.
+# For more details, see https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html.
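+# For example (a sketch; the model and port below are illustrative), the server can be started with:
+#   python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B-Instruct --port 8001
+# and the same port is then passed to raft.py via its -v/--vllm_endpoint flag.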
+class VllmChatService(ChatService):
+    async def execute_chat_request_async(self, api_context: dict, chat_request):
+        try:
+            event_loop = asyncio.get_running_loop()
+            if api_context["model"] in MODEL_NAME_MAPPING:
+                model_name = MODEL_NAME_MAPPING[api_context['model']]
+            else:
+                model_name = api_context['model']
+            client = OpenAI(api_key=api_context['api_key'], base_url="http://localhost:"+ str(api_context['endpoint'])+"/v1")
+            api_chat_call = partial(
+                client.chat.completions.create,
+                model=model_name,
+                messages=chat_request,
+                temperature=0.0
+            )
+            response = await event_loop.run_in_executor(None, api_chat_call)
+            assistant_response = next((choice.message.content for choice in response.choices if choice.message.role == 'assistant'), "")
+            return assistant_response
+        except Exception as error:
+            logging.error(f"Error during chat request execution: {error}",exc_info=True)
+            return ""

+ 19 - 0
recipes/use_cases/end2end-recipes/raft/config.py

@@ -0,0 +1,19 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+import yaml
+import os
+
+def load_config(config_path: str = "./config.yaml"):
+    # Read the YAML configuration file
+    with open(config_path, "r") as file:
+        config = yaml.safe_load(file)
+    # Set the API key from the environment variable
+    try:
+        config["api_key"] = os.environ["OCTOAI_API_TOKEN"]
+    except KeyError:
+        print("API token did not found, please set the OCTOAI_API_TOKEN environment variable if using OctoAI, otherwise set api_key to default EMPTY")
+        # local Vllm endpoint did not need API key, so set the API key to "EMPTY" if OCTOAI_API_TOKEN not found
+        config["api_key"] = "EMPTY"
+    return config
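+
+# Usage sketch (the path is illustrative): export OCTOAI_API_TOKEN in the environment when
+# calling OctoAI, then load the recipe configuration with:
+#   config = load_config("./raft.yaml")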
+

+ 55 - 0
recipes/use_cases/end2end-recipes/raft/data/FAQ.md

@@ -0,0 +1,55 @@
+# FAQ
+
+Here we discuss frequently asked questions that may come up, along with answers we found useful along the way.
+
+1. Does FSDP support mixed precision within one FSDP unit? Meaning, in one FSDP unit some of the parameters are in FP16/BF16 and others in FP32.
+
+    FSDP requires each FSDP unit to have consistent precision, so this case is not supported at this point. It might be added in the future, but there is no ETA at the moment.
+
+2. How does FSDP handle mixed grad requirements?
+
+    FSDP does not support mixed `requires_grad` within one FSDP unit. This means that if you are planning to freeze some layers, you need to do it at the FSDP-unit level rather than the model-layer level. For example, let us assume our model has 30 decoder layers and we want to freeze the bottom 28 layers and only train the top 2 transformer layers. In this case, we need to make sure `requires_grad` for the top two transformer layers is set to `True`.
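+
+    A minimal sketch of doing this (assuming a Hugging Face Llama-style model where the decoder blocks live under `model.model.layers`; the model name is illustrative):
+
+    ```python
+    from transformers import AutoModelForCausalLM
+
+    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+
+    # Freeze everything, then unfreeze the top two decoder layers so that each FSDP unit
+    # (one decoder block under transformer auto-wrapping) has uniform requires_grad.
+    for param in model.parameters():
+        param.requires_grad = False
+    for layer in model.model.layers[-2:]:
+        for param in layer.parameters():
+            param.requires_grad = True
+    # Wrap with FSDP only after freezing, e.g. model = FSDP(model, auto_wrap_policy=...).
+    ```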
+
+3. How do PEFT methods work with FSDP in terms of grad requirements/layer freezing?
+
+    We wrap the PEFT modules separately from the transformer layers in the auto-wrapping policy; this results in the PEFT modules having `requires_grad=True` while the rest of the model has `requires_grad=False`.
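+
+    A minimal sketch of such a policy (mirroring the idea, not necessarily the exact helper used in this repo):
+
+    ```python
+    from functools import partial
+    from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
+
+    def is_trainable_leaf(module):
+        # Leaf modules whose weight is trainable (e.g. LoRA adapter linears) get their own
+        # FSDP unit, so trainable PEFT weights never share a unit with frozen base weights.
+        return (
+            len(list(module.named_children())) == 0
+            and getattr(module, "weight", None) is not None
+            and module.weight.requires_grad
+        )
+
+    peft_wrap_policy = partial(lambda_auto_wrap_policy, lambda_fn=is_trainable_leaf)
+    # In practice this is combined (via an "or" policy) with the transformer-layer auto-wrap policy.
+    ```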
+
+4. Can I add custom datasets?
+
+    Yes, you can find more information on how to do that [here](Dataset.md).
+
+5. What are the hardware SKU requirements for deploying these models?
+
+    Hardware requirements vary based on latency, throughput and cost constraints. For good latency, the models were split across multiple GPUs with tensor parallelism in a machine with NVIDIA A100s or H100s. But TPUs, other types of GPUs like A10G, T4, L4, or even commodity hardware can also be used to deploy these models (e.g. https://github.com/ggerganov/llama.cpp).
+    If working on a CPU, it is worth looking at this [blog post](https://www.intel.com/content/www/us/en/developer/articles/news/llama2.html) from Intel for an idea of Llama 2's performance on a CPU.
+
+6. What are the hardware SKU requirements for fine-tuning Llama pre-trained models?
+
+    Fine-tuning requirements vary based on the amount of data, time to complete fine-tuning and cost constraints. To fine-tune these models we have generally used multiple NVIDIA A100 machines with data parallelism across nodes and a mix of data and tensor parallelism intra node. But using a single machine, or other GPU types like the NVIDIA A10G or H100, is definitely possible (e.g. alpaca models are trained on a single RTX 4090: https://github.com/tloen/alpaca-lora).
+
+7. How do I handle CUDA memory fragmentation during fine-tuning that may lead to an OOM?
+
+    In some cases you may find that after model checkpointing, especially with FSDP (this usually does not happen with PEFT methods), the reserved and allocated CUDA memory has increased. This might be due to CUDA memory fragmentation. PyTorch recently added an environment variable that helps to better manage memory fragmentation (at the time of writing this doc, July 30 2023, this feature is available in PyTorch nightlies). You can set this in your main training script as follows:
+
+    ```python
+    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+    ```
+    We also added this environment variable in `setup_environ_flags` of the [train_utils.py](../src/llama_recipes/utils/train_utils.py), feel free to uncomment it if required.
+
+8. Additional debugging flags?
+
+    The environment variable `TORCH_DISTRIBUTED_DEBUG` can be used to trigger additional useful logging and collective synchronization checks to ensure all ranks are synchronized appropriately. `TORCH_DISTRIBUTED_DEBUG` can be set to either OFF (default), INFO, or DETAIL depending on the debugging level required. Please note that the most verbose option, DETAIL, may impact the application performance and thus should only be used when debugging issues.
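+
+    For example, a minimal way to turn on the most verbose checks from the training script (set it before the process group is initialized, or export it in the launch environment):
+
+    ```python
+    import os
+    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"  # OFF (default), INFO, or DETAIL
+    ```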
+
+    We also added this environment variable in `setup_environ_flags` of the [train_utils.py](../src/llama_recipes/utils/train_utils.py), feel free to uncomment it if required.
+
+9. I am getting import errors when running inference.
+
+    Verify that the CUDA environment variables are set correctly on your machine. For example, for bitsandbytes, you can generally set them as below to get things working on A100 80GB GPUs on AWS.
+
+    ```bash
+    export CUDA_HOME="/usr/local/cuda-11.8"
+    export PATH=$CUDA_HOME/bin:$PATH
+    export LD_LIBRARY_PATH=$CUDA_HOME/lib:$CUDA_HOME/lib64:$CUDA_HOME/efa/lib:/opt/amazon/efa/lib:$LD_LIBRARY_PATH
+    ```

+ 47 - 0
recipes/use_cases/end2end-recipes/raft/doc_processor.py

@@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 3 Community License Agreement.
+
+# Assumed average number of tokens per generated question/answer result (a constant, hence the UPPER_CASE name).
+AVERAGE_TOKENS_PER_RESULT = 100
+
+def get_token_limit_for_model(model: str) -> int:
+    """Returns the token limit for a given model."""
+    if model == "llama-2-13b-chat" or model == "llama-2-70b-chat":
+        return 4096
+    else:
+        return 8192
+
+def calculate_num_tokens_for_message(encoded_text) -> int:
+    """Calculates the number of tokens used by a message."""
+    # Add 3 tokens to account for priming with the assistant's reply.
+    return len(encoded_text) + 3
+
+
+def split_text_into_chunks(context: dict, text: str, tokenizer) -> list[str]:
+    """Splits a long text into substrings based on token length constraints, adjusted for question generation."""
+    # Calculate the maximum number of tokens available for the text chunks
+    encoded_text = tokenizer(text, return_tensors="pt", padding=True)["input_ids"]
+    encoded_text = encoded_text.squeeze()
+    model_token_limit = get_token_limit_for_model(context["model"])
+
+    tokens_for_questions = calculate_num_tokens_for_message(encoded_text)
+    estimated_tokens_per_question = AVERAGE_TOKENS_PER_RESULT
+    estimated_total_question_tokens = estimated_tokens_per_question * context["total_questions"]
+    # Ensure there's a reasonable minimum chunk size
+    max_tokens_for_text = max(model_token_limit - tokens_for_questions - estimated_total_question_tokens, model_token_limit // 10)
+
+    chunks, current_chunk = [], []
+    print(f"Splitting text into chunks of {max_tokens_for_text} tokens, encoded_text {len(encoded_text)}", flush=True)
+    for token in encoded_text:
+        if len(current_chunk) >= max_tokens_for_text:
+            chunks.append(tokenizer.decode(current_chunk).strip())
+            current_chunk = []
+        # Keep the current token either way so no tokens are dropped at chunk boundaries.
+        current_chunk.append(token)
+
+    if current_chunk:
+        chunks.append(tokenizer.decode(current_chunk).strip())
+
+    print(f"Number of chunks in the processed text: {len(chunks)}", flush=True)
+
+    return chunks
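+
+# Usage sketch (an illustration, not part of the pipeline): split this recipe's FAQ.md into
+# chunks with a Llama 3 8B tokenizer. The model name, file path and question count below are
+# assumptions; run from the raft/ directory with access to the gated tokenizer.
+if __name__ == "__main__":
+    from transformers import AutoTokenizer
+    tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
+    # Llama tokenizers ship without a pad token, and split_text_into_chunks pads.
+    tok.pad_token = tok.eos_token
+    sample_context = {"model": "meta-llama-3-8b-instruct", "total_questions": 6}
+    with open("data/FAQ.md") as f:
+        sample_chunks = split_text_into_chunks(sample_context, f.read(), tokenizer=tok)
+    print(f"Produced {len(sample_chunks)} chunks")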

+ 173 - 0
recipes/use_cases/end2end-recipes/raft/format.py

@@ -0,0 +1,173 @@
+from abc import ABC, abstractmethod
+import argparse
+from datasets import Dataset, load_dataset
+from typing import Dict, Literal, Any, get_args
+
+"""
+This file converts raw HuggingFace Datasets into files suitable for fine-tuning completion and chat models.
+"""
+
+OutputDatasetType = Literal["parquet", "jsonl"]
+outputDatasetTypes = list(get_args(OutputDatasetType))
+
+InputDatasetType = Literal["arrow", "jsonl"]
+inputDatasetTypes = list(get_args(InputDatasetType))
+
+DatasetFormat = Literal["hf", "completion", "chat"]
+datasetFormats = list(get_args(DatasetFormat))
+
+def get_args() -> argparse.Namespace:
+    """
+    Parses and returns the arguments specified by the user's command
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--input", type=str, required=True, help="Input HuggingFace dataset file")
+    parser.add_argument("--input-type", type=str, default="arrow", help="Format of the input dataset. Defaults to arrow.", choices=inputDatasetTypes)
+    parser.add_argument("--output", type=str, required=True, help="Output file")
+    parser.add_argument("--output-format", type=str, required=True, help="Format to convert the dataset to", choices=datasetFormats)
+    parser.add_argument("--output-type", type=str, default="jsonl", help="Type to export the dataset to. Defaults to jsonl.", choices=outputDatasetTypes)
+    parser.add_argument("--output-chat-system-prompt", type=str, help="The system prompt to use when the output format is chat")
+
+    args = parser.parse_args()
+    return args
+
+class DatasetFormatter(ABC):
+    """
+    Base class for dataset formatters. Formatters rename, remove and add columns so the
+    dataset matches the expected structure of the target format: HF, chat or completion fine-tuning files.
+    https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
+    """
+    @abstractmethod
+    def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset:
+        pass
+
+class DatasetExporter(ABC):
+    """
+    Base class for dataset exporters. Exporters export the dataset to different file types, e.g. JSONL or Parquet.
+    """
+    @abstractmethod
+    def export(self, ds: Dataset, output_path: str):
+        pass
+
+class DatasetConverter():
+    """
+    Entry point class. It resolves which DatasetFormatter and which DatasetExporter to use and runs them.
+    """
+    formats: Dict[DatasetFormat, DatasetFormatter]
+    exporters: Dict[OutputDatasetType, Any]
+
+    def __init__(self) -> None:
+        self.formats = {
+            "hf": HuggingFaceDatasetFormatter(),
+            "completion": OpenAiCompletionDatasetFormatter(),
+            "chat": OpenAiChatDatasetFormatter()
+        }
+        self.exporters = {
+            "parquet": ParquetDatasetExporter(),
+            "jsonl": JsonlDatasetExporter()
+        }
+
+    def convert(self, ds: Dataset, format: DatasetFormat, output_path: str, output_type: OutputDatasetType, params: Dict[str, str]):
+        if format not in self.formats:
+            raise Exception(f"Output format {format} is not supported, please select one of {list(self.formats.keys())}")
+
+        if output_type not in self.exporters:
+            raise Exception(f"Output type {output_type} is not supported, please select one of {list(self.exporters.keys())}")
+
+        formatter = self.formats[format]
+        newds = formatter.format(ds, params)
+        exporter = self.exporters[output_type]
+        exporter.export(newds, output_path)
+
+class HuggingFaceDatasetFormatter(DatasetFormatter):
+    """
+    Returns the HuggingFace Dataset as is
+    """
+    def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset:
+        return ds
+
+def _remove_all_columns_but(ds: Dataset, keep_columns) -> Dataset:
+    """
+    HF Dataset doesn't have a way to copy only specific columns of a Dataset, so this helper
+    removes all columns but the ones specified.
+    """
+    remove_columns = list(ds.column_names)
+    for keep in keep_columns:
+        remove_columns.remove(keep)
+    ds = ds.remove_columns(remove_columns)
+    return ds
+
+class OpenAiCompletionDatasetFormatter(DatasetFormatter):
+    """
+    Returns the Dataset in the OpenAI Completion Fine-tuning file format with two fields "prompt" and "completion".
+    https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
+    """
+    def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset:
+        newds = ds.rename_columns({'question': 'prompt', 'cot_answer': 'completion'})
+        return _remove_all_columns_but(newds, ['prompt', 'completion'])
+
+class OpenAiChatDatasetFormatter(OpenAiCompletionDatasetFormatter):
+    """
+    Returns the Dataset in the OpenAI Chat Fine-tuning file format with one field "messages".
+    https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
+    """
+    def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset:
+        newds = super().format(ds, params)
+
+        def format_messages(row):
+            messages = []
+            if 'system_prompt' in params:
+                system_prompt = params['system_prompt']
+                messages.append({ "role": "system", "content": system_prompt})
+            messages.extend([{ "role": "user", "content": row['prompt']}, { "role": "assistant", "content": row['completion']}])
+            chat_row = {"messages": messages}
+            return chat_row
+
+        newds = newds.map(format_messages)
+        return _remove_all_columns_but(newds, ['messages'])
+
+def append_extension(path: str, extension: str) -> str:
+    suffix = "." + extension
+    if not path.endswith(suffix):
+        path = path + suffix
+    return path
+
+
+class JsonlDatasetExporter(DatasetExporter):
+    """
+    Exports the Dataset to a JSONL file
+    """
+
+    def export(self, ds: Dataset, output_path: str):
+        ds.to_json(append_extension(output_path, "jsonl"))
+
+
+class ParquetDatasetExporter(DatasetExporter):
+    """
+    Exports the Dataset to a Parquet file
+    """
+
+    def export(self, ds: Dataset, output_path: str):
+        ds.to_parquet(append_extension(output_path, "parquet"))
+
+
+def main():
+    """
+    When format.py is executed from the command line.
+    """
+    args = get_args()
+    ds = load_dataset(args.input_type, data_files={"train": args.input})['train']
+    formatter = DatasetConverter()
+
+    if args.output_chat_system_prompt and args.output_format != "chat":
+        raise Exception("Parameter --output-chat-system-prompt can only be used with --output-format chat")
+
+    format_params = {}
+    if args.output_chat_system_prompt:
+        format_params['system_prompt'] = args.output_chat_system_prompt
+
+    formatter.convert(ds=ds, format=args.output_format, output_path=args.output, output_type=args.output_type, params=format_params)
+
+if __name__ == "__main__":
+    main()

+ 106 - 0
recipes/use_cases/end2end-recipes/raft/raft.py

@@ -0,0 +1,106 @@
+import mdc
+from mdc import MDC
+import logging
+from typing import Literal, Any
+from openai import OpenAI
+import datasets
+from datasets import Dataset, load_dataset
+import json
+import random
+import os, shutil
+import argparse
+import asyncio
+from math import ceil
+from raft_utils import generate_questions, add_chunk_to_dataset
+from chat_utils import OctoAIChatService, VllmChatService
+from format import DatasetConverter, datasetFormats, outputDatasetTypes
+from config import load_config
+
+# def generate_label(client: OpenAI, question: str, context: Any, doctype: DocType = "pdf", model: str = None) -> str | None:
+#     """
+#     Generates the label / answer to `question` using `context` and GPT-4.
+#     """
+#     question = encode_question(question, context) if doctype == "api" else encode_question_gen(question, context)
+#     response = client.chat.completions.create(
+#         model=model,
+#         messages=question,
+#         n=1,
+#         temperature=0
+#     )
+#     response = response.choices[0].message.content
+#     return response
+# Configure logging to include the timestamp, log level, and message
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+
+async def main(context):
+    if context["endpoint"]:
+        chat_service = VllmChatService()
+    else:
+        chat_service = OctoAIChatService()
+    try:
+        logging.info("Starting to generate question pair.")
+        # Generate question/answer pairs as list
+        chunks = await generate_questions(chat_service, context)
+        if not chunks:
+            logging.warning("No questions generated from text. Please check the input context or model configuration.")
+            return
+        logging.info(f"Successfully generated {sum([len(q) for q in chunks])} question/answer pairs.")
+        print(chunks)
+        num_chunks = len(chunks)
+        for i, chunk in enumerate(chunks):
+            perc = ceil(i / num_chunks * 100)
+            with MDC(progress=f"{perc}%"):
+                logging.info(f"Adding chunk {i}/{num_chunks}")
+                add_chunk_to_dataset(chat_service, chunks, chunk, context["questions_per_chunk"], model=context["model"])
+
+        logging.info(f"Data successfully written to {context['output']}. Process completed.")
+    except Exception as e:
+        logging.error(f"An unexpected error occurred during the process: {e}",exc_info=True)
+
+def parse_arguments():
+    # Define command line arguments for the script
+    parser = argparse.ArgumentParser(
+        description="Generate question/answer pairs from documentation."
+    )
+    parser.add_argument(
+        "-t", "--questions_per_chunk",
+        type=int,
+        default=3,
+        help="Specify the number of question pairs to generate per chunk."
+    )
+    parser.add_argument(
+        "-m", "--model",
+        choices=["meta-llama-3-70b-instruct","meta-llama-3-8b-instruct","llama-2-13b-chat", "llama-2-70b-chat"],
+        default="meta-llama-3-70b-instruct",
+        help="Select the model to use for generation."
+    )
+    parser.add_argument(
+        "-c", "--config_path",
+        default="./raft.yaml",
+        help="Set the configuration file path that has system prompt along with language, dataset path and number of questions."
+    )
+    parser.add_argument(
+        "-v", "--vllm_endpoint",
+        default=None,
+        type=int,
+        help="If a port is specified, then use local vllm endpoint for generating question/answer pairs."
+    )
+    parser.add_argument("--chunk_size", type=int, default=512, help="The size of each chunk in number of tokens")
+    parser.add_argument("-o","--output", type=str, default="./", help="The path at which to save the dataset")
+    parser.add_argument("--output-format", type=str, default="hf", help="Format to convert the dataset to. Defaults to hf.", choices=datasetFormats)
+    parser.add_argument("--output-type", type=str, default="jsonl", help="Type to export the dataset to. Defaults to jsonl.", choices=outputDatasetTypes)
+    return parser.parse_args()
+
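+# Example invocation (a sketch; assumes a local vLLM OpenAI-compatible server is already
+# listening on the illustrative port 8001):
+#   python raft.py -m meta-llama-3-70b-instruct -t 3 --chunk_size 512 -v 8001 -c ./raft.yaml -o ./output
+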
+if __name__ == "__main__":
+    logging.info("Initializing the process and loading configuration...")
+    args = parse_arguments()
+
+    context = load_config(args.config_path)
+    context["questions_per_chunk"] = args.questions_per_chunk
+    context["model"] = args.model
+    context["chunk_size"] = args.chunk_size
+    context["endpoint"] = args.vllm_endpoint
+    context["output"] = args.output
+    logging.info(f"Configuration loaded. Generating {args.questions_per_chunk} question per chunk using model '{args.model}'.")
+    if context["endpoint"]:
+        logging.info(f"Use local vllm service at port: '{args.vllm_endpoint}'.")
+    asyncio.run(main(context))

+ 20 - 0
recipes/use_cases/end2end-recipes/raft/raft.yaml

@@ -0,0 +1,20 @@
+COT_prompt_template: >
+  Question: {question}\nContext: {context}\n
+        Answer this question using the information given in the context above. Here are some things to pay attention to:
+        - First provide step-by-step reasoning on how to answer the question.
+        - In the reasoning, if you need to copy-paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This means that anything outside of ##begin_quote## and ##end_quote## is not directly copied from the context.
+        - End your response with the final answer in the form <ANSWER>: $answer; the answer should be succinct.
+        You MUST begin your final answer with the tag "<ANSWER>:".
+
+question_prompt_template: >
+  You are a synthetic question-answer pair generator. Given a chunk of context about
+  some topic(s), generate {num_questions} example questions a user could ask that would be answered
+  using information from the chunk. For example, if the given context was a Wikipedia
+  paragraph about the United States, an example question could be 'How many states are
+  in the United States?'
+  The questions should be able to be answered in a few words or less. Include only the
+  questions in your response.
+
+data_dir: "./data"
+
+num_questions: 2

+ 271 - 0
recipes/use_cases/end2end-recipes/raft/raft_utils.py

@@ -0,0 +1,271 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
+import os
+import re
+import string
+from transformers import AutoTokenizer
+import asyncio
+import magic
+from PyPDF2 import PdfReader
+import json
+from doc_processor import split_text_into_chunks
+import logging
+from datasets import Dataset
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain_experimental.text_splitter import SemanticChunker
+from math import ceil
+import random
+# Initialize logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# Global dataset accumulated by add_chunk_to_dataset; initialized lazily on the first chunk.
+ds = None
+def strip_str(s: str) -> str:
+    """
+    Helper for formatting strings returned by the model: keeps the span from the first
+    alphabetic character up to just past the last one, trimming surrounding junk.
+    """
+    l, r = 0, len(s)-1
+    beg_found = False
+    for i in range(len(s)):
+        if s[i].isalpha():
+            if not beg_found:
+                l = i
+                beg_found = True
+            else:
+                r = i
+    r += 2
+    return s[l:min(r, len(s))]
+def read_text_file(file_path):
+    try:
+        with open(file_path, 'r') as f:
+            text = f.read().strip() + ' '
+            if len(text) == 0:
+                print("File is empty ",file_path)
+            return text
+    except Exception as e:
+        logging.error(f"Error reading text file {file_path}: {e}")
+    return ''
+
+def read_pdf_file(file_path):
+    try:
+        with open(file_path, 'rb') as f:
+            pdf_reader = PdfReader(f)
+            num_pages = len(pdf_reader.pages)
+            file_text = [pdf_reader.pages[page_num].extract_text().strip() + ' ' for page_num in range(num_pages)]
+            text = ''.join(file_text)
+            if len(text) == 0:
+                print("File is empty ", file_path)
+            return text
+    except Exception as e:
+        logging.error(f"Error reading PDF file {file_path}: {e}")
+    return ''
+
+def read_json_file(file_path):
+    try:
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+            # Assuming each item in the list has a 'question' and 'answer' key
+            # Concatenating question and answer pairs with a space in between and accumulating them into a single string
+            file_text = ' '.join([item['question'].strip() + ' ' + item['answer'].strip() + ' ' for item in data])
+            if len(file_text) == 0:
+                print("File is empty ",file_path)
+            return file_text
+    except Exception as e:
+        logging.error(f"Error reading JSON file {file_path}: {e}")
+    return ''
+
+
+def process_file(file_path):
+    print("starting to process file: ", file_path)
+    file_type = magic.from_file(file_path, mime=True)
+    if file_type == 'application/json':
+        return read_json_file(file_path)
+    elif file_type in ['text/plain', 'text/markdown']:
+        return read_text_file(file_path)
+    elif file_type == 'application/pdf':
+        return read_pdf_file(file_path)
+    else:
+        logging.warning(f"Unsupported file type {file_type} for file {file_path}")
+        return ''
+def read_file_content(context):
+    file_strings = []
+
+    for root, _, files in os.walk(context['data_dir']):
+        for file in files:
+            file_path = os.path.join(root, file)
+            file_text = process_file(file_path)
+            if file_text:
+                file_strings.append(file_text)
+    text = '\n'.join(file_strings)
+    return remove_non_printable(text)
+
+def remove_non_printable(s):
+    printable = set(string.printable)
+    return ''.join(filter(lambda x: x in printable, s))
+
+
+async def generate_question_request(chat_service, api_context: dict, document_content: str, num_questions: int) -> dict:
+    if num_questions == 0:
+        logging.info(f"Error: num_questions is 0")
+        return {}
+    prompt_for_system = api_context['question_prompt_template'].format(num_questions=num_questions)
+    chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': str(document_content)}]
+    # parse the result string to a list of dict that has Question, Answer, Context
+    return await chat_service.execute_chat_request_async(api_context, chat_request_payload)
+
+def get_chunks(
+    text: str,
+    chunk_size: int = 512,
+    embedding_model: str = None
+) -> list[str]:
+    """
+    Splits `text` into roughly `len(text) / chunk_size` semantically coherent chunks
+    using the given `embedding_model`, and returns the chunks.
+    """
+    chunks = []
+    if  len(text) == 0:
+        raise TypeError("Can not get chunks from empty text")
+    else:
+        num_chunks = ceil(len(text) / chunk_size)
+        logging.info(f"Splitting text into {num_chunks} chunks")
+        text_splitter = SemanticChunker(embedding_model, number_of_chunks=num_chunks)
+        chunks = text_splitter.create_documents([text])
+        chunks = [chunk.page_content for chunk in chunks]
+
+    return chunks
+# read all the files in the data folder, then split them into chunks
+# generate questions for each chunk and return a list of questions list
+async def generate_questions(chat_service, api_context: dict):
+    document_text = read_file_content(api_context)
+    if len(document_text) == 0:
+        logging.error("Error reading files, document_text is empty")
+    model_name = "sentence-transformers/all-mpnet-base-v2"
+    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
+    document_batches = get_chunks(document_text,api_context["chunk_size"],embedding_model)
+
+    batches_count = len(document_batches)
+    total_questions = api_context["questions_per_chunk"] * batches_count
+
+    print(f"Questions per batch: {api_context['questions_per_chunk']}, Total questions: {total_questions}, Batches: {batches_count}")
+    generation_tasks = []
+    for batch_index, batch_content in enumerate(document_batches):
+        print(f"len of batch_content: {len(batch_content)}, batch_index: {batch_index}")
+        #Distribute extra questions across the first few batches
+        print(f"Batch {batch_index + 1} - {api_context['questions_per_chunk']} questions ********")
+        try:
+            task = generate_question_request(chat_service, api_context, batch_content, api_context["questions_per_chunk"])
+            generation_tasks.append(task)
+        except Exception as e:
+            print(f"Error during chat request execution: {e}")
+
+    question_generation_results = await asyncio.gather(*generation_tasks)
+    final_result = []
+    for result in question_generation_results:
+        queries = result.split('\n')
+        queries = [strip_str(q) for q in queries]
+        queries = [q for q in queries if any(c.isalpha() for c in q)]
+        if len(queries) > int(api_context['questions_per_chunk']):
+            # As the model may emit unrelated questions at the beginning of the result,
+            # if there are more queries than questions_per_chunk, truncate and keep only the last questions_per_chunk lines.
+            queries = queries[-int(api_context['questions_per_chunk']):]
+        final_result.append(queries)
+    return final_result
+
+def add_chunk_to_dataset(
+    client: None,
+    chunks: list[str],
+    chunk: str,
+    x: int = 5,
+    num_distract: int = 3,
+    p: float = 0.8,
+    model: str = None
+) -> None:
+    """
+    Given a chunk, create {Q, A, D} triplets and add them to the dataset.
+    """
+    global ds
+    i = chunks.index(chunk)
+    qs = generate_instructions(client, chunk, x, model) if doctype == "api" else generate_instructions_gen(client, chunk, x, model)
+    for q in qs:
+        datapt = {
+            "id": None,
+            "type": None,
+            "question": None,
+            "context": None,
+            "oracle_context": None,
+            "cot_answer": None
+        }
+
+        datapt["id"] = f"seed_task_{0 if not ds else ds.num_rows}"
+        datapt["type"] = "api call" if doctype == "api" else "general"
+        datapt["question"] = q
+
+        # add num_distract distractor docs
+        docs = [chunk]
+        indices = list(range(0, len(chunks)))
+        indices.remove(i)
+        for j in random.sample(indices, num_distract):
+            docs.append(chunks[j])
+        # decides whether to add oracle document
+        oracle = random.uniform(0, 1) < p
+        if not oracle:
+            docs[0] = chunks[random.sample(indices, 1)[0]]
+        random.shuffle(docs)
+
+        d = {
+            "title": [],
+            "sentences": []
+        }
+
+        d["title"].append(["placeholder_title"]*(num_distract+1))
+        d["sentences"].append(docs)
+        datapt["context"] = d
+        datapt["oracle_context"] = chunk
+
+        # add answer to q
+        datapt["cot_answer"] = generate_label(client, q, chunk, doctype, model=model)
+
+        # construct model instruction
+        context = ""
+        for doc in docs:
+            context += "<DOCUMENT>" + str(doc) + "</DOCUMENT>\n"
+        context += q
+        datapt["instruction"] = context
+
+        # add to dataset
+        if not ds:
+            # init ds
+            datapt["id"] = [datapt["id"]]
+            datapt["type"] = [datapt["type"]]
+            datapt["question"] = [datapt["question"]]
+            datapt["context"] = [datapt["context"]]
+            datapt["oracle_context"] = [datapt["oracle_context"]]
+            datapt["cot_answer"] = [datapt["cot_answer"]]
+            datapt["instruction"] = [datapt["instruction"]]
+            ds = Dataset.from_dict(datapt)
+        else:
+            ds = ds.add_item(datapt)
+
+# This function uses an LLM judge to evaluate the quality of a generated QA pair. It returns the parsed judge result if the response contains a "Result" field; otherwise it returns an empty dict.
+async def LLM_judge_request(chat_service, api_context: dict, document_content: dict) -> dict:
+    prompt_for_system = api_context['judge_prompt_template'].format(language=api_context["language"])
+    chat_request_payload = [{'role': 'system', 'content': prompt_for_system}, {'role': 'user', 'content': f"Question: {document_content['Question']} \n Teacher's Answer: {document_content['Ground_truth']}\n Student's Answer: {document_content['Generated_answer']} "}]
+    result = await chat_service.execute_chat_request_async(api_context, chat_request_payload)
+    if not result:
+        return {}
+    # No further parsing needed: load the JSON result string as a dict and return it.
+    result = json.loads(result)
+    if "Result" not in result:
+        print("Error: eval response does not contain answer")
+        print(document_content,result)
+        return {}
+    return result
+
+async def generate_LLM_eval(chat_service, api_context: dict, judge_list: list):
+    eval_tasks = []
+    for batch_index, batch_content in enumerate(judge_list):
+        try:
+            result = LLM_judge_request(chat_service, api_context, batch_content)
+            eval_tasks.append(result)
+        except Exception as e:
+            print(f"Error during data eval request execution: {e}")
+
+    judge_results = await asyncio.gather(*eval_tasks)
+    return judge_results