
add more eval QA

Kai Wu 1 year ago
parent
commit
ddb7f1c15c

+ 1 - 1
recipes/use_cases/end2end-recipes/raft/README.md

@@ -22,7 +22,7 @@ CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server  --model m
 Once the server is ready, we can query it on port 8001 from another terminal. Here, "-u" sets the endpoint URL to query and "-t" sets the number of questions we ask the Meta Llama3 70B Instruct model to generate per chunk. To use a cloud API, change the endpoint URL to the cloud provider's and set the API key with "-k". Since we want to query our locally hosted VLLM server here, we can use the following command:
 
 ```bash
-python raft.py -u "http://localhost:8001/v1" -k "EMPTY" -t 3
+python raft.py -u "http://localhost:8001/v1" -k "EMPTY" -t 5
 ```
 
 For cloud API key, we can also set it using system environment variables, such as
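
The "-t" value bumped from 3 to 5 here is what produces the additional eval QA in this commit: it becomes `api_config["questions_per_chunk"]` and is interpolated into the question prompt once per document chunk. A minimal sketch of that path, reusing the config keys that appear in raft_utils.py in this same commit (the helper name `build_question_tasks` is illustrative, not part of the repo):

```python
# Minimal sketch (not the actual raft.py code): how the "-t" value, stored as
# api_config["questions_per_chunk"], ends up in one prompt per document chunk.
def build_question_tasks(api_config, document_batches):
    # `question_prompt_template` and `questions_per_chunk` are the config keys
    # used in raft_utils.py; this helper itself is hypothetical.
    return [
        api_config["question_prompt_template"].format(
            num_questions=str(api_config["questions_per_chunk"]),
            context=document,
        )
        for document in document_batches
    ]
```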

File diff suppressed because it is too large
+ 0 - 2
recipes/use_cases/end2end-recipes/raft/data/website_data


+ 0 - 6
recipes/use_cases/end2end-recipes/raft/data_urls.xml

@@ -102,18 +102,12 @@
 <loc>http://raw.githubusercontent.com/meta-llama/llama/main/README.md</loc>
 </url>
 <url>
-<loc>http://raw.githubusercontent.com/meta-llama/llama/main/LICENSE.md</loc>
-</url>
-<url>
 <loc>http://raw.githubusercontent.com/meta-llama/llama3/main/MODEL_CARD.md</loc>
 </url>
 <url>
 <loc>http://raw.githubusercontent.com/meta-llama/llama3/main/README.md</loc>
 </url>
 <url>
-<loc>http://raw.githubusercontent.com/meta-llama/llama3/main/LICENSE.md</loc>
-</url>
-<url>
 <loc>http://raw.githubusercontent.com/meta-llama/codellama/main/MODEL_CARD.md</loc>
 </url>
 <url>

+ 7 - 15
recipes/use_cases/end2end-recipes/raft/eval_raft.py

@@ -5,21 +5,14 @@ import evaluate
 import argparse
 from config import load_config
 import json
-from itertools import chain
 from langchain_openai import ChatOpenAI
-
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import DirectoryLoader
-from langchain_core.runnables import RunnablePassthrough
-
-from langchain_core.messages import HumanMessage, SystemMessage
 import re
 import string
-from collections import Counter
-from langchain_core.output_parsers import StrOutputParser
-from langchain.prompts.prompt import PromptTemplate
+
 
 def generate_answers_model_only(model_name,question_list,api_url="http://localhost:8000/v1",key="EMPTY"):
         # Use langchain to load the documents from data directory
@@ -57,7 +50,7 @@ def generate_answers_with_RAG(model_name, question_list,api_config,api_url_overw
     loader = DirectoryLoader(data_dir)
     docs = loader.load()
     # Split the document into chunks with a specified chunk size
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=api_config["chunk_size"], chunk_overlap=int(api_config["chunk_size"]/10))
     all_splits = text_splitter.split_documents(docs)
 
     # Store the document into a vector store with a specific embedding model
@@ -260,6 +253,7 @@ def main(api_config):
                 fp.write("\n------------------------------------\n")
         # Now we want to take a closer look at the questions that are not answered the same by all the models.
         judge_zip = list(zip(*[item[-1] for item in all_metrics]))
+        model_names = [item[0] for item in all_metrics]
         with open(api_config["output_log"],"a") as fp:
             for item in all_metrics:
                 fp.write(f"Model_Name: {item[0]}, LLM_SCORE: {item[1]} \n")
@@ -270,12 +264,8 @@ def main(api_config):
                 else:
                     fp.write(f"Comparing interested question: {questions[idx]} \n")
                     fp.write(f"groud_truth: {groud_truth[idx]} \n")
-                    fp.write(f"{item[2]} Baseline_answers: {generated_answers['Baseline'][idx]} \n")
-                    fp.write(f"{item[3]} Baseline_RAG_answers: {generated_answers['Baseline_RAG'][idx]} \n")
-                    fp.write(f"{item[0]} RAFT_answers: {generated_answers['RAFT'][idx]} \n")
-                    fp.write(f"{item[1]} RAFT_RAG_answers: {generated_answers['RAFT_RAG'][idx]} \n")
-                    fp.write(f"{item[4]} 70B_Base_answers: {generated_answers['70B_Base'][idx]} \n")
-                    fp.write(f"{item[5]} 70B_RAG_answers: {generated_answers['70B_RAG'][idx]} \n")
+                    for i in range(len(model_names)):
+                        fp.write(f"{item[i]} {model_names[i]}_answers: {generated_answers[model_names[i]][idx]} \n")
                     fp.write("-------\n")
 
 
@@ -328,6 +318,7 @@ def parse_arguments():
         type=str,
         help="LLM API key for generating question/answer pairs."
     )
+    parser.add_argument("--chunk_size", type=int, default=1000, help="The character size of each chunk used in RAG")
     return parser.parse_args()
 
 if __name__ == "__main__":
@@ -342,6 +333,7 @@ if __name__ == "__main__":
     api_config["judge_endpoint"] = args.judge_endpoint
     api_config["output_log"] = args.output_log
     api_config["api_key"] = args.api_key
+    api_config["chunk_size"] = args.chunk_size
     if api_config["judge_endpoint"]:
         logging.info(f"Use local vllm service for judge at port: '{args.judge_endpoint}'.")
     main(api_config)
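
With this change, chunking in the eval script is no longer hard-coded at 512 characters: the new `--chunk_size` flag (default 1000) drives the splitter, and the overlap is fixed at 10% of the chunk size. A minimal sketch of that retrieval setup, assuming the FAISS store and embedding model implied by the imports in eval_raft.py (the exact construction and the helper name are illustrative):

```python
# Minimal sketch, not the full generate_answers_with_RAG: how the new
# api_config["chunk_size"] value configures the splitter and retriever.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

def build_retriever(docs, chunk_size=1000):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,  # 10% overlap, matching this commit
    )
    splits = splitter.split_documents(docs)
    # Embedding model name is an assumption; eval_raft.py only shows the import here.
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    return FAISS.from_documents(splits, embeddings).as_retriever()
```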

File diff suppressed because it is too large
+ 217 - 129
recipes/use_cases/end2end-recipes/raft/evalset.json


+ 2 - 2
recipes/use_cases/end2end-recipes/raft/raft.py

@@ -70,8 +70,8 @@ def parse_arguments():
         type=str,
         help="LLM API key for generating question/answer pairs."
     )
-    parser.add_argument("--chunk_size", type=int, default=512, help="The size of each chunk in number of tokens")
-    parser.add_argument("-o","--output", type=str, default="./", help="The path at which to save the dataset")
+    parser.add_argument("--chunk_size", type=int, default=1000, help="The size of each chunk in number of tokens")
+    parser.add_argument("-o","--output", type=str, default="./output/", help="The path at which to save the dataset")
     parser.add_argument("--output-format", type=str, default="hf", help="Format to convert the dataset to. Defaults to hf.", choices=datasetFormats)
     parser.add_argument("--output-type", type=str, default="jsonl", help="Type to export the dataset to. Defaults to jsonl.", choices=outputDatasetTypes)
     return parser.parse_args()

+ 1 - 1
recipes/use_cases/end2end-recipes/raft/raft.yaml

@@ -31,7 +31,7 @@ question_prompt_template: >
 #   4. Never use any abbreviation.
 #   5. Include only the questions in your response.
 
-data_dir: "/home/kaiwu/work/pytorch/docs"
+data_dir: "./data"
 
 xml_path: ""
 

+ 7 - 15
recipes/use_cases/end2end-recipes/raft/raft_utils.py

@@ -2,21 +2,15 @@
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
 import os
-from transformers import  AutoTokenizer
 import logging
-import json
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_experimental.text_splitter import SemanticChunker
 from math import ceil
-import datasets
-from datasets import Dataset, load_dataset
+from datasets import Dataset
 import random
 from langchain_community.document_loaders import SitemapLoader,DirectoryLoader
 from bs4 import BeautifulSoup
-from langchain_openai import ChatOpenAI
-from langchain_core.messages import HumanMessage, SystemMessage
-from langchain_community.llms import ChatOpenAI
-from langchain_core.prompts import ChatPromptTemplate
+
 from langchain_openai import ChatOpenAI
 
 
@@ -124,21 +118,19 @@ def generate_questions(api_config):
         logging.info(f"Error reading files, document_text is {len(document_text)}")
     embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2",model_kwargs={'device': 'cuda'})
     document_batches = get_chunks(document_text,api_config["chunk_size"],embedding_model)
-
-    batches_count = len(document_batches)
-    total_questions = api_config["questions_per_chunk"] * batches_count
 # use OpenAI API protocol to handle the chat request, including local VLLM openai compatible server
     llm = ChatOpenAI(
         openai_api_key=key,
         openai_api_base=api_url,
         model_name=api_config["model"],
         temperature=0.0,
-        max_tokens=250
+        max_tokens=500
         )
     all_tasks = [api_config['question_prompt_template'].format(num_questions=str(api_config['questions_per_chunk']),context=document) for document in document_batches]
     generated_answers = llm.batch(all_tasks)
+    generated_answers = [ item.content for item in generated_answers]
     if len(generated_answers) == 0:
-        logging.error("No model answers generated. Please check the input context or model configuration in ",model_name)
+        logging.error(f"No model answers generated. Please check the input context or model configuration in {api_config['model']}")
         return []
     final_result = []
     for result in generated_answers:
@@ -167,9 +159,10 @@ def generate_COT(chunk_questions_zip,api_config) -> dict:
         openai_api_base=api_config["endpoint_url"],
         model_name=api_config["model"],
         temperature=0.0,
-        max_tokens=350
+        max_tokens=500
         )
     generated_answers = llm.batch(all_tasks)
+    generated_answers = [ item.content for item in generated_answers]
     COT_results = []
     # return a list of (chunk, question, generated_answer)
     for (chunk, question),generated_answer in zip(chunk_questions,generated_answers):
@@ -186,7 +179,6 @@ def add_chunk_to_dataset(
     """
     Given a chunk and related questions lists, create {Q, A, D} triplets and add them to the dataset.
     """
-    COT_tasks = []
     chunks = [chunk for chunk, _ in chunk_questions_zip]
     COT_results = generate_COT(chunk_questions_zip,api_config)
     for chunk, q , cot in COT_results:
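
Both generation paths now unwrap the chat model's responses with `.content` before downstream parsing. That is needed because `ChatOpenAI.batch()` returns message objects rather than strings; a minimal sketch of the pattern (the endpoint matches the README above, while the key and model name are placeholders, not values from this commit):

```python
# Sketch: llm.batch() returns AIMessage objects; the added list comprehension
# pulls out the plain text so the string parsing that follows still works.
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    openai_api_key="EMPTY",                      # "EMPTY" works for a local vLLM server
    openai_api_base="http://localhost:8001/v1",  # local vLLM endpoint, as in the README
    model_name="meta-llama/Meta-Llama-3-70B-Instruct",  # illustrative model name
    temperature=0.0,
    max_tokens=500,
)
messages = llm.batch([
    "Generate 5 questions about RAFT.",
    "Generate 5 questions about Llama 3.",
])                                        # -> list of AIMessage objects
texts = [m.content for m in messages]     # plain strings, as added in this commit
```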