Kai Wu 1 year ago
parent
commit
f963bb8254

File diff suppressed because the file is too large
+ 75 - 53
recipes/use_cases/end2end-recipes/raft/README.md


File diff suppressed because the file is too large
+ 0 - 44
recipes/use_cases/end2end-recipes/raft/data/website_data


+ 34 - 1
recipes/use_cases/end2end-recipes/raft/data_urls.xml

@@ -126,6 +126,39 @@
 <loc>http://raw.githubusercontent.com/meta-llama/PurpleLlama/main/Llama-Guard/MODEL_CARD.md</loc>
 </url>
 <url>
-<loc>http://raw.githubusercontent.com/meta-llama/PurpleLlama/main/Llama-Guard/README.md</loc>
+<loc>https://hamel.dev/notes/llm/inference/03_inference.html</loc>
+</url>
+<url>
+<loc>https://www.anyscale.com/blog/continuous-batching-llm-inference</loc>
+</url>
+<url>
+<loc>https://github.com/huggingface/peft</loc>
+</url><url>
+<loc>https://github.com/facebookresearch/llama-recipes/blob/main/docs/LLM_finetuning.md</loc>
+</url>
+<url>
+<loc>https://github.com/meta-llama/llama-recipes/blob/main/recipes/finetuning/datasets/README.md</loc>
+</url><url>
+<loc>https://www.databricks.com/blog/efficient-fine-tuning-lora-guide-llms</loc>
+</url>
+<url>
+<loc>https://www.wandb.courses/courses/training-fine-tuning-LLMs</loc>
+</url>
+<url>
+<loc>https://www.snowflake.com/blog/meta-code-llama-testing/</loc>
+</url><url>
+<loc>https://www.phind.com/blog/code-llama-beats-gpt4</loc>
+</url>
+<loc>https://www.anyscale.com/blog/llama-2-is-about-as-factually-accurate-as-gpt-4-for-summaries-and-is-30x-cheaper</loc>
+</url>
+<url>
+<loc>https://ragntune.com/blog/gpt3.5-vs-llama2-finetuning</loc>
+</url><url>
+<loc>https://deci.ai/blog/fine-tune-llama-2-with-lora-for-question-answering/</loc>
+</url>
+<url>
+<loc>https://replicate.com/blog/fine-tune-translation-model-axolotl</loc>
+</url><url>
+<loc>https://huyenchip.com/2023/04/11/llm-engineering.html</loc>
 </url>
 </urlset>

+ 0 - 51
recipes/use_cases/end2end-recipes/raft/eval_config.yaml

@@ -1,51 +0,0 @@
-eval_prompt_template: >
-  <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a AI assistant that skilled in answering questions related to Llama language models,
-  which includes LLama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1,	Meta Llama Guard 2,
-  Below is a question from a llama user, please the answer it with best of your knowledge,
-  The returned answer should be no more than 100 words.Please return the answers in text directly without any special tokens.<|eot_id|>
-  <|start_header_id|>user<|end_header_id|>
-  Question:{question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
-# judge_prompt_template: >
-#   <|begin_of_text|><|start_header_id|>system<|end_header_id|>You have been provided with a question, a teacher's answer and a student's answer above. Given that question, you need to score the how good the student answer is compare to
-#   the teacher's answer. If the student's answer is correct based on the teacher's answer, then return YES, else return NO.
-#   Review it carefully to make sure that the keywords and numerical vaules are exactly the same.
-#   Only respond with "YES" or "NO", do not respond with anything else.<|eot_id|>
-#   <|start_header_id|>user<|end_header_id|>
-#   Question: {question} \n Teacher's Answer: {gold} \n Student's Answer: {prediction} <|eot_id|><|start_header_id|>assistant<|end_header_id|>
-judge_prompt_template: >
-    <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a teacher grading a quiz.
-
-    You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER.
-
-    Here is the grade criteria to follow:
-    (1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer.
-    (2) Ensure that the student answer does not contain any conflicting statements.
-    (3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.
-
-    Score:
-    YES means that the student's answer meets all of the criteria. This is the highest (best) score.
-    NO means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.
-
-    Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
-
-    Avoid simply stating the correct answer at the outset.
-    End your response with final answer in the form <ANSWER>: $answer, answer must be YES or NO  <|eot_id|>
-    <|start_header_id|>user<|end_header_id|>
-    QUESTION: {{question}}
-    GROUND TRUTH ANSWER: {{gold}}
-    STUDENT ANSWER: {{prediction}}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-RAG_prompt_template: >
-  <|begin_of_text|><|start_header_id|>system<|end_header_id|> Answer the following question using the information given in the context below. Here is things to pay attention to:
-    - First provide step-by-step reasoning on how to answer the question.
-    - In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.
-    - End your response with final answer in the form <ANSWER>: $answer, the answer should less than 60 words.
-    You MUST begin your final answer with the tag "<ANSWER>:<|eot_id|>
-  <|start_header_id|>user<|end_header_id|>
-  Question: {question}\nContext: {context}\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>
-eval_json: "./evalset.json"
-
-raft_model_name: "raft-8b"
-
-base_model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
-
-data_dir: "./data"
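
The removed judge_prompt_template asks the grader model to finish its response with "<ANSWER>: YES" or "<ANSWER>: NO". A minimal Python sketch (not part of this commit; the helper name is hypothetical) of how such a verdict could be pulled out of a raw response, mirroring the old `split("<ANSWER>")[-1]` check in eval_raft.py:

def parse_judge_verdict(response_text: str) -> bool:
    # Look only at the text after the final "<ANSWER>" tag, as the removed
    # judge prompt instructs the model to end with "<ANSWER>: YES/NO".
    tail = response_text.split("<ANSWER>")[-1]
    return "YES" in tail.upper()

# Example: parse_judge_verdict("... step-by-step reasoning ...\n<ANSWER>: YES") -> True
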

File diff suppressed because the file is too large
+ 287 - 0
recipes/use_cases/end2end-recipes/raft/eval_llama.json


File diff suppressed because the file is too large
+ 0 - 218
recipes/use_cases/end2end-recipes/raft/evalset.json


+ 0 - 173
recipes/use_cases/end2end-recipes/raft/format.py

@@ -1,173 +0,0 @@
-from abc import ABC, abstractmethod
-import argparse
-from datasets import Dataset, load_dataset
-from typing import Dict, Literal, Any, get_args
-
-"""
-This file allows to convert raw HuggingFace Datasets into files suitable to fine tune completion and chat models.
-"""
-
-OutputDatasetType = Literal["parquet", "jsonl"]
-outputDatasetTypes = list(get_args(OutputDatasetType))
-
-InputDatasetType = Literal["arrow", "jsonl"]
-inputDatasetTypes = list(get_args(InputDatasetType))
-
-DatasetFormat = Literal["hf", "completion", "chat"]
-datasetFormats = list(get_args(DatasetFormat))
-
-def get_args() -> argparse.Namespace:
-    """
-    Parses and returns the arguments specified by the user's command
-    """
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument("--input", type=str, required=True, help="Input HuggingFace dataset file")
-    parser.add_argument("--input-type", type=str, default="arrow", help="Format of the input dataset. Defaults to arrow.", choices=inputDatasetTypes)
-    parser.add_argument("--output", type=str, required=True, help="Output file")
-    parser.add_argument("--output-format", type=str, required=True, help="Format to convert the dataset to", choices=datasetFormats)
-    parser.add_argument("--output-type", type=str, default="jsonl", help="Type to export the dataset to. Defaults to jsonl.", choices=outputDatasetTypes)
-    parser.add_argument("--output-chat-system-prompt", type=str, help="The system prompt to use when the output format is chat")
-
-    args = parser.parse_args()
-    return args
-
-class DatasetFormatter(ABC):
-    """
-    Base class for dataset formatters. Formatters rename columns, remove and add 
-    columns to match the expected target format structure. HF, Chat or Completion models file formats.
-    https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
-    """
-    @abstractmethod
-    def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset:
-        pass
-
-class DatasetExporter(ABC):
-    """
-    Base class for dataset exporters. Exporters export dataset to different file types, JSONL, Parquet, ...
-    """
-    @abstractmethod
-    def export(self, ds: Dataset, output_path: str):
-        pass
-
-class DatasetConverter():
-    """
-    Entry point class. It resolves which DatasetFormatter and which DatasetExporter to use and runs them.
-    """
-    formats: Dict[DatasetFormat, DatasetFormatter]
-    exporters: Dict[OutputDatasetType, Any]
-
-    def __init__(self) -> None:
-        self.formats = {
-            "hf": HuggingFaceDatasetFormatter(),
-            "completion": OpenAiCompletionDatasetFormatter(),
-            "chat": OpenAiChatDatasetFormatter()
-        }
-        self.exporters = {
-            "parquet": ParquetDatasetExporter(),
-            "jsonl": JsonlDatasetExporter()
-        }
-
-    def convert(self, ds: Dataset, format: DatasetFormat, output_path: str, output_type: OutputDatasetType, params: Dict[str, str]):
-        if not format in self.formats:
-            raise Exception(f"Output Format {format} is not supported, pleased select one of {self.formats.keys()}")
-        
-        if not output_type in self.exporters:
-            raise Exception(f"Output Type {output_type} is not supported, pleased select one of {self.exporters.keys()}")
-
-        formatter = self.formats[format]
-        newds = formatter.format(ds, params)
-        exporter = self.exporters[output_type]
-        exporter.export(newds, output_path)
-
-class HuggingFaceDatasetFormatter(DatasetFormatter):
-    """
-    Returns the HuggingFace Dataset as is
-    """
-    def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset:
-        return ds
-
-def _remove_all_columns_but(ds: Dataset, keep_columns) -> Dataset:
-    """
-    HF Dataset doesn't have a way to copy only specific columns of a Dataset so this help
-    removes all columns but the ones specified.
-    """
-    remove_columns = list(ds.column_names)
-    for keep in keep_columns:
-        remove_columns.remove(keep)
-    ds = ds.remove_columns(remove_columns)
-    return ds
-
-class OpenAiCompletionDatasetFormatter(DatasetFormatter):
-    """
-    Returns the Dataset in the OpenAI Completion Fine-tuning file format with two fields "prompt" and "completion".
-    https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
-    """
-    def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset:
-        newds = ds.rename_columns({'question': 'prompt', 'cot_answer': 'completion'})
-        return _remove_all_columns_but(newds, ['prompt', 'completion'])
-
-class OpenAiChatDatasetFormatter(OpenAiCompletionDatasetFormatter):
-    """
-    Returns the Dataset in the OpenAI Chat Fine-tuning file format with one field "messages".
-    https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
-    """
-    def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset:
-        newds = super().format(ds, params)
-
-        def format_messages(row):
-            messages = []
-            if 'system_prompt' in params:
-                system_prompt = params['system_prompt']
-                messages.append({ "role": "system", "content": system_prompt})
-            messages.extend([{ "role": "user", "content": row['prompt']}, { "role": "assistant", "content": row['completion']}])
-            chat_row = {"messages": messages}
-            return chat_row
-
-        newds = newds.map(format_messages)
-        return _remove_all_columns_but(newds, ['messages'])
-
-def append_extension(path: str, extension: str) -> str:
-    suffix = "." + extension
-    if not path.endswith(suffix):
-        path = path + suffix
-    return path
-
-
-class JsonlDatasetExporter(DatasetExporter):
-    """
-    Exports the Dataset to a JSONL file
-    """
-
-    def export(self, ds: Dataset, output_path: str):
-        ds.to_json(append_extension(output_path, "jsonl"))
-
-
-class ParquetDatasetExporter(DatasetExporter):
-    """
-    Exports the Dataset to a Parquet file
-    """
-
-    def export(self, ds: Dataset, output_path: str):
-        ds.to_parquet(append_extension(output_path, "parquet"))
-
-
-def main():
-    """
-    When raft.py is executed from the command line.
-    """
-    args = get_args()
-    ds = load_dataset(args.input_type, data_files={"train": args.input})['train']
-    formatter = DatasetConverter()
-
-    if args.output_chat_system_prompt and args.output_format != "chat":
-        raise Exception("Parameter --output-chat-system-prompt can only be used with --output-format chat")
-
-    format_params = {}
-    if args.output_chat_system_prompt:
-        format_params['system_prompt'] = args.output_chat_system_prompt
-
-    formatter.convert(ds=ds, format=args.output_format, output_path=args.output, output_type=args.output_type, params=format_params)
-
-if __name__ == "__main__":
-    main()
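
The removed format.py wrapped the conversion in a small CLI, but its DatasetConverter class could also be driven programmatically. A hedged sketch, assuming a RAFT JSONL output with "question" and "cot_answer" columns (paths, column values, and the system prompt below are illustrative, not from the commit):

from datasets import load_dataset
# from format import DatasetConverter   # as the module existed before this commit

# Load the RAFT output with the standard "json" builder.
ds = load_dataset("json", data_files={"train": "raft_output.jsonl"})["train"]

converter = DatasetConverter()
# Convert to the OpenAI chat fine-tuning format, injecting a system prompt;
# ".jsonl" is appended to the output path automatically by append_extension.
converter.convert(
    ds=ds,
    format="chat",
    output_path="train_chat",
    output_type="jsonl",
    params={"system_prompt": "You are a helpful Llama assistant."},
)
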

+ 25 - 21
recipes/use_cases/end2end-recipes/raft/raft.yaml

@@ -7,30 +7,34 @@ COT_prompt_template: >
   <|start_header_id|>user<|end_header_id|>
   Question: {question}\nContext: {context}\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>
 
+# question_prompt_template: >
+#   <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a synthetic question-answer pair generator. Given a chunk of context about
+#   some topic(s), generate {num_questions} example questions a user could ask and would be answered
+#   using information from the chunk. For example, if the given context was a Wikipedia
+#   paragraph about the United States, an example question could be 'How many states are
+#   in the United States?
+#   The questions should be able to be answered in 100 words or less. Include only the
+#   questions in your response.<|eot_id|>
+#   <|start_header_id|>user<|end_header_id|>
+#   Context: {context}\n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
 question_prompt_template: >
-  <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a synthetic question-answer pair generator. Given a chunk of context about
-  some topic(s), generate {num_questions} example questions a user could ask and would be answered
-  using information from the chunk. For example, if the given context was a Wikipedia
-  paragraph about the United States, an example question could be 'How many states are
-  in the United States?
-  The questions should be able to be answered in 100 words or less. Include only the
-  questions in your response.<|eot_id|>
+  <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a language model skilled in creating quiz questions.
+  You will be provided with a document,
+  read it and please generate factoid question and answer pairs that are most likely be asked by a user of Llama language models
+  which includes LLama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1,	Meta Llama Guard 2
+  Your factoid questions should be answerable with a specific, concise piece of factual information from the context.
+  Your factoid questions should be formulated in the same style as questions users could ask in a search engine.
+  This means that your factoid questions MUST NOT mention something like "according to the passage" or "context".
+  please make sure you follow those rules:
+  1. Generate {num_questions} question answer pairs, you can generate less answer if there is nothing related to
+  model, training, fine-tuning and evaluation details of Llama language models,
+  2. The questions can be answered based *solely* on the given passage.
+  3. Avoid asking questions with similar meaning.
+  4. Never use any abbreviation.
+  5. The questions should be able to be answered in 60 words or less. Include only the questions in your response. <|eot_id|>
   <|start_header_id|>user<|end_header_id|>
   Context: {context}\n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
-
-# question_prompt_template: >
-#   You are a language model skilled in creating quiz questions.
-#   You will be provided with a document,
-#   read it and please generate question and answer pairs that are most likely be asked by a user of Llama language models
-#   which includes LLama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1,	Meta Llama Guard 2
-#   Output only the questions related to Llama:
-#   please make sure you follow those rules:
-#   1. Generate {num_questions} question answer pairs, you can generate less answer if there is nothing related to model, training, fine-tuning and evaluation details of Llama language models, .
-#   2. The questions can be answered based *solely* on the given passage.
-#   3. Avoid asking questions with similar meaning.
-#   4. Never use any abbreviation.
-#   5. Include only the questions in your response.
-
 data_dir: "./data"
 
 xml_path: ""
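
The new question_prompt_template in raft.yaml is a plain string with {num_questions} and {context} placeholders. A minimal sketch of rendering it before sending it to the question-generation model (illustrative only; the loading code and sample values are assumptions, only the config path and placeholder names come from the diff):

import yaml

with open("raft.yaml") as f:
    config = yaml.safe_load(f)

# Fill in both placeholders used by the folded template.
prompt = config["question_prompt_template"].format(
    num_questions=3,
    context="Meta Llama 3 comes in 8B and 70B parameter sizes.",
)
# `prompt` would then be passed as the chat request content for question generation.
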

+ 101 - 51
recipes/use_cases/end2end-recipes/raft/eval_raft.py

@@ -8,10 +8,15 @@ import json
 from langchain_openai import ChatOpenAI
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.text_splitter import RecursiveCharacterTextSplitter,TokenTextSplitter
+from langchain_community.vectorstores.utils import DistanceStrategy
+from datetime import datetime
 from langchain_community.document_loaders import DirectoryLoader
 import re
 import string
+import pandas as pd 
+from langchain.retrievers.document_compressors import FlashrankRerank
+from transformers import AutoTokenizer
 
 
 def generate_answers_model_only(model_name,question_list,api_url="http://localhost:8000/v1",key="EMPTY"):
@@ -36,28 +41,48 @@ def generate_answers_model_only(model_name,question_list,api_url="http://localho
 def format_docs_raft(docs):
     context = ""
     for doc in docs:
-        context += "<DOCUMENT>" + str(doc.page_content) + "</DOCUMENT>\n"
+        context += "\n<DOCUMENT>" + str(doc.page_content) + "</DOCUMENT>\n"
     return context
-def format_docs(docs):
-    return "\n\n".join(doc.page_content for doc in docs)
-def generate_answers_with_RAG(model_name, question_list,api_config,api_url_overwrite=None):
-    data_dir = api_config['data_dir']
-    api_url = "http://localhost:"+str(api_config['vllm_endpoint'])+"/v1"
-    if api_url_overwrite:
-        api_url = api_url_overwrite
-    key = api_config['api_key']
+def build_retriever(api_config,embedding_model_name,retrieved_docs_num=5):
     # Use langchain to load the documents from data directory
-    loader = DirectoryLoader(data_dir)
+    loader = DirectoryLoader(api_config['data_dir'])
     docs = loader.load()
     # Split the document into chunks with a specified chunk size
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=api_config["chunk_size"], chunk_overlap=int(api_config["chunk_size"]/10))
-    all_splits = text_splitter.split_documents(docs)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=api_config["chunk_size"],chunk_overlap=int(api_config["chunk_size"] / 10),add_start_index=True,strip_whitespace=True)
+    # text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+    #     AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B"),
+    #     chunk_size=api_config["chunk_size"],
+    #     chunk_overlap=int(api_config["chunk_size"] / 10),
+    #     add_start_index=True,
+    #     strip_whitespace=True,
+    #     separators=["\n\n", "\n", ".", " ", ""],
+    # )
+    docs_processed = text_splitter.split_documents(docs)
+    # Remove duplicates
+    unique_texts = {}
+    docs_processed_unique = []
+    for doc in docs_processed:
+        if doc.page_content not in unique_texts:
+            unique_texts[doc.page_content] = True
+            docs_processed_unique.append(doc)
 
     # Store the document into a vector store with a specific embedding model
-    vectorstore = FAISS.from_documents(all_splits, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2",model_kwargs={'device': 'cuda'}))
+    embedding_model = HuggingFaceEmbeddings(
+        model_name=embedding_model_name,
+        model_kwargs={"device": "cuda"},
+        encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
+    )
+    vectorstore = FAISS.from_documents(docs_processed_unique, embedding_model, distance_strategy=DistanceStrategy.COSINE)
     retriever = vectorstore.as_retriever(
-        search_kwargs={"k": 5}
+        search_kwargs={"k": retrieved_docs_num},
     )
+    return retriever
+def generate_answers_with_RAG(model_name, question_list,api_config,retriever,api_url_overwrite=None):
+    api_url = "http://localhost:"+str(api_config['vllm_endpoint'])+"/v1"
+    if api_url_overwrite:
+        api_url = api_url_overwrite
+    key = api_config['api_key']
+    rerank_topk = api_config["rerank_topk"]
     # Load the RAFT model
     llm = ChatOpenAI(
         openai_api_key=key,
@@ -68,13 +93,14 @@ def generate_answers_with_RAG(model_name, question_list,api_config,api_url_overw
         )
     all_tasks = []
     for q in question_list:
-        # retrive the top 6 documents
-        retrieved_docs = retriever.invoke(q)
+        # retrive the top K documents
+        retrieved_docs = retriever.invoke(q)        
+        if rerank_topk:
+            ranker = FlashrankRerank(top_n=rerank_topk)
+            documents = ranker.compress_documents(retrieved_docs,q)
         # format the documents into a string
-        if '8B-Instruct' in model_name:
-            documents = format_docs(retrieved_docs)
-        else:
-            documents = format_docs_raft(retrieved_docs)
+
+        documents = format_docs_raft(retrieved_docs)
         # create a prompt
         text = api_config["RAG_prompt_template"].format(context=documents,question=q)
         all_tasks.append(text)
@@ -157,10 +183,10 @@ def compute_judge_score(questions: list, generated : list, reference: list, api_
         message = api_config['judge_prompt_template'].format(question=question,prediction=prediction,gold=gold)
         all_tasks.append(message)
     judge_responses = llm.batch(all_tasks)
-    judge_responses = ["YES" in item.content.split("<ANSWER>")[-1] for item in judge_responses]
+    judge_responses = ["YES" in item.content for item in judge_responses]
     correct_num = sum(judge_responses)
     return correct_num/len(questions),judge_responses
-def score_single(api_config,generated,reference,questions, run_exact_match=True,run_rouge=True, run_bert=True, run_llm_as_judge=True):
+def score_single(api_config,generated,reference,questions, run_exact_match=True,run_rouge=True, run_bert=False, run_llm_as_judge=True):
     # set metric to default -1, means no metric is computed
     metric = {
         "Rouge_score": -1,
@@ -196,12 +222,18 @@ def main(api_config):
     try:
         api_url = "http://localhost:"+str(api_config["vllm_endpoint"])+"/v1"
         logging.info("Starting to generate answer given the eval set.")
-        with open(api_config["eval_json"]) as fp:
-            eval_json = json.load(fp)
         questions,groud_truth = [],[]
-        for index, item in enumerate(eval_json):
-            questions.append(item["question"])
-            groud_truth.append(item["answer"])
+        if api_config["eval_file"].endswith(".parquet"):
+            eval_file = pd.read_parquet(api_config["eval_file"],filters=[('source', '=', 'pt_discuss_forum')])
+            for index, item in eval_file.iterrows():
+                questions.append(item["question"]+"\nDetails:\n"+item["context"])
+                groud_truth.append(item["answer"])
+        else:
+            with open(api_config["eval_file"]) as fp:
+                eval_file = json.load(fp)
+                for index, item in enumerate(eval_file):
+                    questions.append(item["question"])
+                    groud_truth.append(item["answer"])
         generated_answers = {
             "RAFT": [],
             "RAFT_RAG": [],
@@ -211,29 +243,30 @@ def main(api_config):
             "70B_Base": [],
             
         }
-        # Generate answers for baseline
-        base_model_name = api_config["base_model_name"]
-        generated_answers["Baseline"] = generate_answers_model_only(base_model_name,questions,api_url)
-        generated_answers["Baseline_RAG"] = generate_answers_with_RAG(base_model_name, questions,api_config)
-        # Generate answers for RAFT
-        raft_model_name = api_config["raft_model_name"]
-        generated_answers["RAFT"] = generate_answers_model_only(raft_model_name,questions,api_url)
-        generated_answers["RAFT_RAG"] = generate_answers_with_RAG(raft_model_name, questions,api_config)
-
+        # build retriver
+        retriever = build_retriever(api_config,"sentence-transformers/multi-qa-mpnet-base-cos-v1",api_config["rag_topk"])
+        # Generate answers for 8B models
+        model_name = api_config["model_name"]
+        generated_answers[model_name] = generate_answers_model_only(model_name,questions,api_url)
+        generated_answers[model_name+"_RAG"] = generate_answers_with_RAG(model_name, questions,api_config,retriever)
+        print("Finished generating answers for ", model_name)
         large_model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
         large_api_url = "http://localhost:"+str(api_config["judge_endpoint"])+"/v1"
         generated_answers["70B_Base"] = generate_answers_model_only(large_model_name,questions,large_api_url)
-        generated_answers["70B_RAG"] = generate_answers_with_RAG(large_model_name, questions,api_config,large_api_url,)
-        logging.info(f"Successfully generated {len(generated_answers['Baseline_RAG'])} answers for all models.")
+        generated_answers["70B_RAG"] = generate_answers_with_RAG(large_model_name, questions,api_config,retriever,large_api_url)
+        print("Finished generating answers for ", large_model_name)
+        logging.info(f"Successfully generated {len(generated_answers[model_name])} answers for all models.")
         # for generate answer from each model, compute the score metric
         all_metrics = []
+        output_file = api_config["output_log"]+str(datetime.now().strftime("%Y%m%d_%H%M%S"))
+
         for model_name,model_answer in generated_answers.items():
             if len(model_answer) != len(groud_truth):
                 print(f"The length of {model_name} answer is not equal to the length of ground truth.")
                 continue
             metric = score_single(api_config,model_answer,groud_truth,questions)
             print(f"The eval result for {model_name} is: {metric}")
-            with open(api_config["output_log"],"a") as fp:
+            with open(output_file,"a") as fp:
                 fp.write(f"Eval_result for {model_name} \n")
                 fp.write(f"Rouge_score: {metric['Rouge_score']} \n")
                 fp.write(f"BERTScore Precision: {metric['BERTScore_Precision']:.4f}, Recall: {metric['BERTScore_Recall']:.4f}, F1: {metric['BERTScore_F1']:.4f} \n")
@@ -254,20 +287,21 @@ def main(api_config):
         # Now we want to take a closer look at the questions that are not answered the same by all the models.
         judge_zip = list(zip(*[item[-1] for item in all_metrics]))
         model_names = [item[0] for item in all_metrics]
-        with open(api_config["output_log"],"a") as fp:
+        with open(output_file,"a") as fp:
             for item in all_metrics:
                 fp.write(f"Model_Name: {item[0]}, LLM_SCORE: {item[1]} \n")
             for idx,item in enumerate(judge_zip):
-                # if all the responses are "YES" or all the responses are "NO", then we skip this question
-                if sum([r=="YES" for r in item]) == len(item) or sum([r=="YES" for r in item]) == 0:
+                # if all the responses are "YES", then we skip this question
+                if sum(item) == len(item):
                     continue 
                 else:
                     fp.write(f"Comparing interested question: {questions[idx]} \n")
                     fp.write(f"groud_truth: {groud_truth[idx]} \n")
                     for i in range(len(model_names)):
                         fp.write(f"{item[i]} {model_names[i]}_answers: {generated_answers[model_names[i]][idx]} \n")
-                    fp.write("-------\n")
-
+                    fp.write("------------------------\n")
+            fp.write(json.dumps(all_metrics))
+        print("Finished evaluating the model.")
 
 
         logging.info(f"Eval successfully, the eval result is saved to {api_config['output_log']}.")
@@ -281,13 +315,13 @@ def parse_arguments():
         description="Generate question/answer pairs from documentation."
     )
     parser.add_argument(
-        "-m", "--raft_model_name",
+        "-m", "--model_name",
         default=None,
-        help="Provide the raft_model_name to use for evaluation. If not specified, the model_path in eval_config.yaml will be used."
+        help="Provide the model_name to use for evaluation. If not specified, the model_path in eval_config.yaml will be used."
     )
     parser.add_argument(
         "-c", "--config_path",
-        default="eval_config.yaml",
+        default="raft_eval_config.yaml",
         help="Set the configuration file path that has system prompt along with language, evalset path."
     )
     parser.add_argument(
@@ -309,8 +343,8 @@ def parse_arguments():
     )
     parser.add_argument(
         "-o", "--output_log",
-        default="eval_result.log",
-        help="save the eval result to a log file. Default is eval_result.log"
+        default="./eval_result",
+        help="save the eval result to a log file. Default is eval_result[timestamp].log"
     )
     parser.add_argument(
         "-k", "--api_key",
@@ -318,6 +352,18 @@ def parse_arguments():
         type=str,
         help="LLM API key for generating question/answer pairs."
     )
+    parser.add_argument(
+        "-r", "--rag_topk",
+        default=5,
+        type=int,
+        help="set the number of top k documents the RAG needs to retrive."
+    )
+    parser.add_argument(
+        "--rerank_topk",
+        default=0,
+        type=int,
+        help="set the number of top k documents the reranker needs to retrive."
+    )
     parser.add_argument("--chunk_size", type=int, default=1000, help="The character size of each chunk used in RAG")
     return parser.parse_args()
 
@@ -329,11 +375,15 @@ if __name__ == "__main__":
     if args.data_dir:
         api_config["data_dir"] = args.data_dir
     if args.raft_model_name:
-        api_config["raft_model_name"] = args.raft_model_name
+        api_config["model_name"] = args.model_name
     api_config["judge_endpoint"] = args.judge_endpoint
     api_config["output_log"] = args.output_log
     api_config["api_key"] = args.api_key
     api_config["chunk_size"] = args.chunk_size
+    api_config["rag_topk"] = args.rag_topk
+    api_config["rerank_topk"] = args.rerank_topk
+    if api_config["rag_topk"] < api_config["rerank_topk"]:
+        logging.error("The rerank_topk should be smaller than rag_topk.")
     if api_config["judge_endpoint"]:
         logging.info(f"Use local vllm service for judge at port: '{args.judge_endpoint}'.")
     main(api_config)
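
For reference, a condensed sketch of the retrieve-then-optionally-rerank step introduced in generate_answers_with_RAG (illustrative only; it assumes a retriever built by build_retriever and shows the reranked documents being used for formatting, which is the apparent intent of the new rerank_topk option):

from langchain.retrievers.document_compressors import FlashrankRerank

def retrieve_context(question, retriever, rerank_topk=0):
    # Retrieve the top-k chunks from the FAISS vector store.
    docs = retriever.invoke(question)
    if rerank_topk:
        # Keep only the top-n chunks after reranking with FlashRank.
        docs = FlashrankRerank(top_n=rerank_topk).compress_documents(docs, question)
    # Wrap each chunk in <DOCUMENT> tags, matching format_docs_raft and the RAG prompt.
    return "".join("\n<DOCUMENT>" + str(d.page_content) + "</DOCUMENT>\n" for d in docs)
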

+ 36 - 0
recipes/use_cases/end2end-recipes/raft/raft_eval_config.yaml

@@ -0,0 +1,36 @@
+eval_prompt_template: >
+  <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a AI assistant that skilled in answering questions related to Llama language models,
+  which includes LLama, Llama2, Meta Llama3, Code Llama, Meta Llama Guard 1,	Meta Llama Guard 2,
+  Below is a question from a llama user, please the answer it with best of your knowledge,
+  The returned answer should be no more than 100 words.Please return the answers in text directly without any special tokens.<|eot_id|>
+  <|start_header_id|>user<|end_header_id|>
+  Question:{question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
+judge_prompt_template: >
+    <|begin_of_text|><|start_header_id|>system<|end_header_id|>You have been provided with a question, a teacher's answer and a student's answer below.
+    Given that question, you need to score the how good the student answer is compare to
+    the teacher's answer. If the student's answer is correct based on the teacher's answer, then return YES, else return NO.
+    Here are the grade criterias to follow:
+    1. Review it carefully to make sure that the keywords and numerical vaules are exactly the same.
+    2. Ensure that the student answer does not contain any conflicting statements.
+    3. It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.
+    YES means that the student's answer meets all of the criteria.
+    NO means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.
+    Only respond with "YES" or "NO", do not respond with anything else.<|eot_id|>
+    <|start_header_id|>user<|end_header_id|>
+    Question: {question} \n Teacher's Answer: {gold} \n Student's Answer: {prediction} <|eot_id|><|start_header_id|>assistant<|end_header_id|>
+RAG_prompt_template: >
+  <|begin_of_text|><|start_header_id|>system<|end_header_id|> Answer the following question using the information given in the context below. Here is things to pay attention to:
+    1.The context contains many documents, each document starts with <DOCUMENT> and ends </DOCUMENT>.
+    2.First provide step-by-step reasoning on how to answer the question.
+    3.In the reasoning, if you need to copy paste some sentences from the context, include them in ##begin_quote## and ##end_quote##. This would mean that things outside of ##begin_quote## and ##end_quote## are not directly copy paste from the context.
+    4.End your response with final answer in the form <ANSWER>: $answer, the answer should less than 60 words.
+    You MUST begin your final answer with the tag "<ANSWER>:<|eot_id|>
+  <|start_header_id|>user<|end_header_id|>
+  Question: {question}\nContext: {context}\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+eval_file: "./eval_llama.json"
+
+model_name: "raft-8b"
+
+data_dir: "./data"
+
+rag_topk: 5

+ 4 - 6
recipes/use_cases/end2end-recipes/raft/raft_utils.py

@@ -3,8 +3,7 @@
 
 import os
 import logging
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from langchain_experimental.text_splitter import SemanticChunker
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from math import ceil
 from datasets import Dataset
 import random
@@ -90,7 +89,7 @@ def read_file_content(xml_path: str, data_folder: str) -> str:
 def get_chunks(
     text: str,
     chunk_size: int = 512,
-    embedding_model: str = None
+    api_config: dict = None,
 ) -> list[str]:
     """
     Takes in a `file_path` and `doctype`, retrieves the document, breaks it down into chunks of size
@@ -102,7 +101,7 @@ def get_chunks(
     else:
         num_chunks = ceil(len(text) / chunk_size)
         logging.info(f"Splitting text into {num_chunks} chunks")
-        text_splitter = SemanticChunker(embedding_model, number_of_chunks=num_chunks)
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=api_config["chunk_size"], chunk_overlap=int(api_config["chunk_size"]/10))
         chunks = text_splitter.create_documents([text])
         chunks = [chunk.page_content for chunk in chunks]
 
@@ -116,8 +115,7 @@ def generate_questions(api_config):
     document_text = read_file_content(api_config["xml_path"],api_config["data_dir"])
     if len(document_text) == 0:
         logging.info(f"Error reading files, document_text is {len(document_text)}")
-    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2",model_kwargs={'device': 'cuda'})
-    document_batches = get_chunks(document_text,api_config["chunk_size"],embedding_model)
+    document_batches = get_chunks(document_text,api_config["chunk_size"],api_config)
     # use OpenAI API protocol to hanlde the chat request, including local VLLM openai compatible server
     llm = ChatOpenAI(
         openai_api_key=key,
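
The raft_utils.py change swaps the embedding-based SemanticChunker for a character-based RecursiveCharacterTextSplitter. A minimal standalone sketch of the new chunking path in get_chunks (the chunk size and sample text below are assumptions; the 10% overlap matches the diff):

from langchain.text_splitter import RecursiveCharacterTextSplitter

chunk_size = 512
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=int(chunk_size / 10),  # 10% overlap, as in the diff
)
docs = splitter.create_documents(["...document text read from the data directory..."])
chunks = [d.page_content for d in docs]
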