modified README and requirements.txt

Kai Wu, 10 months ago
parent commit afcc874590

+ 12 - 1
.github/scripts/spellcheck_conf/wordlist.txt

@@ -1350,4 +1350,15 @@ SalesBot
 Weaviate
 MediaGen
 SDXL
-SVD
+SVD
+LLMScore
+RecursiveCharacterTextSplitter
+TPD
+TPM
+Tianjun
+Zhang
+distractor
+distractors
+frac
+numRefusal
+totalQA

+ 102 - 94
recipes/use_cases/end2end-recipes/raft/README.md

File diff suppressed because it is too large


+ 0 - 1
recipes/use_cases/end2end-recipes/raft/config.py

@@ -2,7 +2,6 @@
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
 import yaml
-import os
 
 def load_config(config_path: str = "./config.yaml"):
     # Read the YAML configuration file
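This hunk only drops the unused `os` import; the body of `load_config` below the shown context is not part of the diff. A minimal sketch of the trimmed module, with the YAML-reading body assumed rather than copied from the repo:

```python
import yaml

def load_config(config_path: str = "./config.yaml"):
    # Read the YAML configuration file
    # (this body is assumed; only the lines above appear in the diff)
    with open(config_path, "r") as f:
        return yaml.safe_load(f)
```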

+ 3 - 39
recipes/use_cases/end2end-recipes/raft/raft_eval.py

@@ -15,7 +15,6 @@ from langchain_community.document_loaders import DirectoryLoader
 import re
 import string
 import pandas as pd 
-from langchain.retrievers.document_compressors import FlashrankRerank
 
 
 def generate_answers_model_only(model_name,question_list,api_url="http://localhost:8000/v1",key="EMPTY"):
@@ -73,7 +72,6 @@ def generate_answers_with_RAG(model_name, question_list,api_config,retriever,api
     if api_url_overwrite:
         api_url = api_url_overwrite
     key = api_config['api_key']
-    rerank_topk = api_config["rerank_topk"]
     # Load the RAFT model
     llm = ChatOpenAI(
         openai_api_key=key,
@@ -86,11 +84,7 @@ def generate_answers_with_RAG(model_name, question_list,api_config,retriever,api
     for q in question_list:
         # retrive the top K documents
         retrieved_docs = retriever.invoke(q)        
-        if rerank_topk:
-            ranker = FlashrankRerank(top_n=rerank_topk)
-            documents = ranker.compress_documents(retrieved_docs,q)
         # format the documents into a string
-
         documents = format_docs_raft(retrieved_docs)
         # create a prompt
         text = api_config["RAG_prompt_template"].format(context=documents,question=q)
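With `FlashrankRerank` removed, the per-question loop in `generate_answers_with_RAG` reduces to retrieve, format, prompt. A condensed sketch of the resulting function follows; `format_docs_raft` is stubbed here for self-containment (the real helper lives elsewhere in the script), and the default endpoint and the batched model call are assumptions, not lines from this diff:

```python
from langchain_openai import ChatOpenAI

def format_docs_raft(docs):
    # Assumed helper: join retrieved documents into one context string.
    return "\n\n".join(getattr(d, "page_content", str(d)) for d in docs)

def generate_answers_with_RAG(model_name, question_list, api_config, retriever,
                              api_url_overwrite=None):
    # Default endpoint mirrors generate_answers_model_only; override if given.
    api_url = api_url_overwrite or "http://localhost:8000/v1"
    llm = ChatOpenAI(
        openai_api_key=api_config["api_key"],
        openai_api_base=api_url,
        model_name=model_name,
        temperature=0.0,
    )
    prompts = []
    for q in question_list:
        # Retrieve the top K documents (no reranking step any more).
        retrieved_docs = retriever.invoke(q)
        # Format the retrieved documents into a single context string.
        documents = format_docs_raft(retrieved_docs)
        # Fill the RAG prompt template with context and question.
        prompts.append(api_config["RAG_prompt_template"].format(context=documents, question=q))
    # Send all prompts to the RAFT model in one batch (call pattern assumed).
    responses = llm.batch(prompts)
    return [r.content for r in responses]
```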
@@ -149,17 +143,6 @@ def exact_match_score(prediction, ground_truth):
         if (normalize_answer(pred) == normalize_answer(gold)):
             num_match += 1
     return num_match/len(ground_truth)
-def compute_bert_score(generated : list, reference: list):
-    bertscore = evaluate.load("bertscore")
-    score = bertscore.compute(
-        predictions=generated,
-        references=reference,
-        lang="en"
-    )
-    f1 = score["f1"]
-    precision = score["precision"]
-    recall = score["recall"]
-    return sum(precision)/len(precision), sum(recall)/len(recall), sum(f1)/len(f1)
 def compute_judge_score(questions: list, generated : list, reference: list, api_config,api_url="http://localhost:8001/v1",key="EMPTY"):
     correct_num = 0
     model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
@@ -177,13 +160,10 @@ def compute_judge_score(questions: list, generated : list, reference: list, api_
     judge_responses = ["YES" in item.content for item in judge_responses]
     correct_num = sum(judge_responses)
     return correct_num/len(questions),judge_responses
-def score_single(api_config,generated,reference,questions, run_exact_match=True,run_rouge=True, run_bert=False, run_llm_as_judge=True):
+def score_single(api_config,generated,reference,questions, run_exact_match=True,run_rouge=True, run_llm_as_judge=True):
     # set metric to default -1, means no metric is computed
     metric = {
         "Rouge_score": -1,
-        "BERTScore_Precision": -1,
-        "BERTScore_Recall": -1,
-        "BERTScore_F1": -1,
         "LLM_judge_score": -1,
         "Exact_match": -1
     }
@@ -191,12 +171,6 @@ def score_single(api_config,generated,reference,questions, run_exact_match=True,
         rouge_score = compute_rouge_score(generated,reference)
         metric["Rouge_score"] = rouge_score
         print("Rouge_score:",rouge_score)
-    if run_bert:
-        P, R, F1 = compute_bert_score(generated,reference)
-        print(f"BERTScore Precision: {P:.4f}, Recall: {R:.4f}, F1: {F1:.4f}")
-        metric["BERTScore_Precision"] = P
-        metric["BERTScore_Recall"] = R
-        metric["BERTScore_F1"] = F1
     if api_config["judge_endpoint_url"] and run_llm_as_judge:
         api_url = api_config["judge_endpoint_url"]
         LLM_judge_score,judge_responses = compute_judge_score(questions, generated, reference, api_config,api_url=api_url)
@@ -235,8 +209,8 @@ def main(api_config):
         print("Finished generating answers for ", model_name)
         large_model_name = "meta-llama/Meta-Llama-3-70B-Instruct"
         large_api_url = api_config["judge_endpoint_url"]
-        #generated_answers["70B_Base"] = generate_answers_model_only(large_model_name,questions,large_api_url)
-        #generated_answers["70B_RAG"] = generate_answers_with_RAG(large_model_name, questions,api_config,retriever,large_api_url)
+        generated_answers["70B_Base"] = generate_answers_model_only(large_model_name,questions,large_api_url)
+        generated_answers["70B_RAG"] = generate_answers_with_RAG(large_model_name, questions,api_config,retriever,large_api_url)
         print("Finished generating answers for ", large_model_name)
         logging.info(f"Successfully generated {len(generated_answers[model_name+'_RAG'])} answers for all models.")
         # for generate answer from each model, compute the score metric
@@ -252,7 +226,6 @@ def main(api_config):
             with open(output_file,"a") as fp:
                 fp.write(f"Eval_result for {model_name} \n")
                 fp.write(f"Rouge_score: {metric['Rouge_score']} \n")
-                fp.write(f"BERTScore Precision: {metric['BERTScore_Precision']:.4f}, Recall: {metric['BERTScore_Recall']:.4f}, F1: {metric['BERTScore_F1']:.4f} \n")
                 fp.write(f"Exact_match_percentage: {metric['Exact_match']} \n")
                 judge_responses = ["None"] * len(questions)
                 if api_config["judge_endpoint_url"]:
@@ -341,12 +314,6 @@ def parse_arguments():
         type=int,
         help="set the number of top k documents the RAG needs to retrive."
     )
-    parser.add_argument(
-        "--rerank_topk",
-        default=0,
-        type=int,
-        help="set the number of top k documents the reranker needs to retrive."
-    )
     parser.add_argument("--chunk_size", type=int, default=1000, help="The character size of each chunk used in RAG")
     return parser.parse_args()
 
@@ -364,9 +331,6 @@ if __name__ == "__main__":
     api_config["api_key"] = args.api_key
     api_config["chunk_size"] = args.chunk_size
     api_config["rag_topk"] = args.rag_topk
-    api_config["rerank_topk"] = args.rerank_topk
-    if api_config["rag_topk"] < api_config["rerank_topk"]:
-        logging.error("The rerank_topk should be smaller than rag_topk.")
     if api_config["judge_endpoint_url"]:
         logging.info(f"The judge model url is: '{args.judge_endpoint_url}'.")
     main(api_config)
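For reference, the LLM-as-judge metric that this change keeps (while dropping BERTScore) amounts to asking a Meta-Llama-3-70B-Instruct judge for a YES/NO verdict per question and reporting the fraction of YES answers. A self-contained sketch along the lines of `compute_judge_score`; the prompt template and its placeholder names are assumptions, since the real template comes from config.yaml and is not part of this diff:

```python
from langchain_openai import ChatOpenAI

def judge_fraction_correct(questions, generated, reference,
                           judge_prompt_template,
                           api_url="http://localhost:8001/v1", key="EMPTY"):
    # Judge model name as used in compute_judge_score.
    judge = ChatOpenAI(
        openai_api_key=key,
        openai_api_base=api_url,
        model_name="meta-llama/Meta-Llama-3-70B-Instruct",
        temperature=0.0,
    )
    # One judging prompt per (question, generated answer, reference answer);
    # the placeholder names are assumed, not taken from the repo's config.
    prompts = [
        judge_prompt_template.format(question=q, gold=ref, answer=gen)
        for q, gen, ref in zip(questions, generated, reference)
    ]
    judge_responses = judge.batch(prompts)
    # Count YES verdicts, mirroring the `"YES" in item.content` check in the diff.
    verdicts = ["YES" in item.content for item in judge_responses]
    return sum(verdicts) / len(questions), verdicts
```

With the diff's `score_single`, this fraction ends up in `metric["LLM_judge_score"]`, alongside `Rouge_score` and `Exact_match` (a value of -1 means the metric was skipped).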

+ 0 - 9
requirements.txt

@@ -19,19 +19,10 @@ chardet
 openai
 typing-extensions==4.8.0
 tabulate
-octoai
-python-magic
-PyPDF2
 aiofiles
 evaluate
 rouge_score
-bert_score
-mdc
-langchain_experimental
-python-dotenv==1.0.1
 pyyaml==6.0.1
-coloredlogs==15.0.1
-sentence_transformers
 faiss-gpu
 unstructured[pdf]
 langchain_openai