
Enable vLLM-based eval for HF and fine-tuned models; clean up and refactor text2sql_eval; trim eval requirements to the minimum packages; merge the PEFT adapter into the base model so vLLM can serve the fine-tuned checkpoint

Jeff Tang, 3 months ago
Commit 5baa1e3fd7

+ 3 - 12
end-to-end-use-cases/coding/text2sql/eval/llama_eval.sh

@@ -2,14 +2,6 @@ eval_path='../data/dev_20240627/dev.json'
 db_root_path='../data/dev_20240627/dev_databases/'
 ground_truth_path='../data/'
 
-# Llama models on Together
-#YOUR_API_KEY='YOUR_TOGETHER_API_KEY'
-#model='meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo'
-#model='meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo'
-#model='meta-llama/Llama-3.3-70B-Instruct-Turbo'
-#model='meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8'
-#model='meta-llama/Llama-4-Scout-17B-16E-Instruct'
-
 # Llama models on Llama API
 YOUR_API_KEY='YOUR_LLAMA_API_KEY'
 model='Llama-3.3-8B-Instruct'
@@ -17,14 +9,13 @@ model='Llama-3.3-8B-Instruct'
 #model='Llama-4-Maverick-17B-128E-Instruct-FP8'
 #model='Llama-4-Scout-17B-16E-Instruct-FP8'
 
-# Llama model on Hugging Face Hub
-# https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+# Llama model on Hugging Face Hub https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
 # YOUR_API_KEY='huggingface'
 # model='meta-llama/Llama-3.1-8B-Instruct'
 
 # Fine-tuned Llama models locally
-#YOUR_API_KEY='finetuned'
-#model='../fine_tuning/llama31-8b-text2sql-fine-tuned'
+# YOUR_API_KEY='finetuned'
+# model='../fine-tuning/final_test/llama31-8b-text2sql-peft-quantized-cot_merged'
 
 data_output_path="./output/$model/"
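
Note on the new local model path: vLLM serves plain Hugging Face checkpoints, so a PEFT/LoRA adapter has to be merged into its base model first, which is what the `_merged` suffix above refers to. Below is a minimal sketch of such a merge step using the standard peft API; the directory names are illustrative placeholders, not the repo's actual merge script.

```python
# Minimal sketch: fold a PEFT (LoRA) adapter into its base model so vLLM can
# serve the merged checkpoint directly. Paths are illustrative placeholders.
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

adapter_dir = "llama31-8b-text2sql-adapter"   # hypothetical PEFT adapter checkpoint
merged_dir = "llama31-8b-text2sql-merged"     # output directory vLLM will load

model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_dir, torch_dtype=torch.float16, device_map="auto"
)
merged = model.merge_and_unload()             # merge LoRA weights into the base model
merged.save_pretrained(merged_dir)
# assumes the tokenizer was saved alongside the adapter during fine-tuning
AutoTokenizer.from_pretrained(adapter_dir).save_pretrained(merged_dir)
```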
 

+ 54 - 143
end-to-end-use-cases/coding/text2sql/eval/llama_text2sql.py

@@ -5,21 +5,13 @@ import re
 import sqlite3
 from typing import Dict
 
-import torch
-from langchain_together import ChatTogether
 from llama_api_client import LlamaAPIClient
-from peft import AutoPeftModelForCausalLM
-from tqdm import tqdm
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    BitsAndBytesConfig,
-    pipeline,
-)
-
-MAX_NEW_TOKENS=10240  # If API has max tokens (vs max new tokens), we calculate it
-
-def local_llama(prompt, pipe):
+
+MAX_NEW_TOKENS = 10240  # If API has max tokens (vs max new tokens), we calculate it
+TIMEOUT = 60  # Timeout in seconds for each API call
+
+
+def local_llama(client, prompt, model):
     SYSTEM_PROMPT = "You are a text to SQL query translator. Using the SQLite DB Schema and the External Knowledge, translate the following text question into a SQLite SQL select statement."
     # UNCOMMENT TO USE THE FINE_TUNED MODEL WITH REASONING DATASET
     # SYSTEM_PROMPT = "You are a text to SQL query translator. Using the SQLite DB Schema and the External Knowledge, generate the step-by-step reasoning and the final SQLite SQL select statement from the text question."
@@ -28,27 +20,13 @@ def local_llama(prompt, pipe):
         {"content": SYSTEM_PROMPT, "role": "system"},
         {"role": "user", "content": prompt},
     ]
-
-    raw_prompt = pipe.tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True,
-    )
-
-    print(f"local_llama: {raw_prompt=}")
-
-    outputs = pipe(
-        raw_prompt,
-        max_new_tokens=MAX_NEW_TOKENS,
-        do_sample=False,
-        temperature=0.0,
-        top_k=50,
-        top_p=0.1,
-        eos_token_id=pipe.tokenizer.eos_token_id,
-        pad_token_id=pipe.tokenizer.pad_token_id,
+    print(f"local_llama: {model=}")
+    chat_response = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        timeout=TIMEOUT,
     )
-
-    answer = outputs[0]["generated_text"][len(raw_prompt) :].strip()
+    answer = chat_response.choices[0].message.content.strip()
 
     pattern = re.compile(r"```sql\n*(.*?)```", re.DOTALL)
     matches = pattern.findall(answer)
@@ -98,12 +76,9 @@ def nice_look_table(column_names: list, values: list):
         for i in range(len(column_names))
     ]
 
-    # Print the column names
     header = "".join(
         f"{column.rjust(width)} " for column, width in zip(column_names, widths)
     )
-    # print(header)
-    # Print the values
     for value in values:
         row = "".join(f"{str(v).rjust(width)} " for v, width in zip(value, widths))
         rows.append(row)
@@ -176,33 +151,22 @@ def generate_combined_prompts_one(db_path, question, knowledge=None):
     return combined_prompts
 
 
-def cloud_llama(api_key, model, prompt, max_tokens, temperature, stop):
+def cloud_llama(client, api_key, model, prompt, max_tokens, temperature, stop):
 
     SYSTEM_PROMPT = "You are a text to SQL query translator. Using the SQLite DB Schema and the External Knowledge, translate the following text question into a SQLite SQL select statement."
     try:
-        if model.startswith("meta-llama/"):
-            final_prompt = SYSTEM_PROMPT + "\n\n" + prompt
-            final_max_tokens = len(final_prompt) + MAX_NEW_TOKENS
-            llm = ChatTogether(
-                model=model,
-                temperature=0,
-                max_tokens=final_max_tokens,
-            )
-            answer = llm.invoke(final_prompt).content
-        else:
-            client = LlamaAPIClient()
-            messages = [
-                {"content": SYSTEM_PROMPT, "role": "system"},
-                {"role": "user", "content": prompt},
-            ]
-            final_max_tokens = len(messages) + MAX_NEW_TOKENS
-            response = client.chat.completions.create(
-                model=model,
-                messages=messages,
-                temperature=0,
-                max_completion_tokens=final_max_tokens,
-            )
-            answer = response.completion_message.content.text
+        messages = [
+            {"content": SYSTEM_PROMPT, "role": "system"},
+            {"role": "user", "content": prompt},
+        ]
+        final_max_tokens = len(messages) + MAX_NEW_TOKENS
+        response = client.chat.completions.create(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_completion_tokens=final_max_tokens,
+        )
+        answer = response.completion_message.content.text
 
         pattern = re.compile(r"```sql\n*(.*?)```", re.DOTALL)
         matches = pattern.findall(answer)
@@ -218,57 +182,6 @@ def cloud_llama(api_key, model, prompt, max_tokens, temperature, stop):
     return result
 
 
-def huggingface_finetuned(api_key, model):
-    if api_key == "finetuned":
-        model_id = model
-
-        # Check if this is a PEFT model by looking for adapter_config.json
-        import os
-
-        is_peft_model = os.path.exists(os.path.join(model_id, "adapter_config.json"))
-
-        if is_peft_model:
-            # Use AutoPeftModelForCausalLM for PEFT fine-tuned models
-            print(f"Loading PEFT model from {model_id}")
-            model = AutoPeftModelForCausalLM.from_pretrained(
-                model_id, device_map="auto", torch_dtype=torch.float16
-            )
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-        else:
-            # Use AutoModelForCausalLM for FFT (Full Fine-Tuning) models
-            print(f"Loading FFT model from {model_id}")
-            model = AutoModelForCausalLM.from_pretrained(
-                model_id, device_map="auto", torch_dtype=torch.float16
-            )
-            tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-            # For FFT models, handle pad token if it was added during training
-            if tokenizer.pad_token is None:
-                tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-                model.resize_token_embeddings(len(tokenizer))
-
-        tokenizer.padding_side = "right"  # to prevent warnings
-
-    elif api_key == "huggingface":
-        model_id = model
-        bnb_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.bfloat16,
-        )
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            device_map="auto",
-            torch_dtype=torch.bfloat16,
-            quantization_config=bnb_config,  # None
-        )
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
-    return pipe
-
-
 def collect_response_from_llama(
     db_path_list, question_list, api_key, model, knowledge_list=None
 ):
@@ -280,9 +193,19 @@ def collect_response_from_llama(
     response_list = []
 
     if api_key in ["huggingface", "finetuned"]:
-        pipe = huggingface_finetuned(api_key=api_key, model=model)
+        from openai import OpenAI
+
+        openai_api_key = "EMPTY"
+        openai_api_base = "http://localhost:8000/v1"
 
-    for i, question in tqdm(enumerate(question_list)):
+        client = OpenAI(
+            api_key=openai_api_key,
+            base_url=openai_api_base,
+        )
+    else:
+        client = LlamaAPIClient()
+
+    for i, question in enumerate(question_list):
         print(
             "--------------------- processing question #{}---------------------".format(
                 i + 1
@@ -300,9 +223,11 @@ def collect_response_from_llama(
             )
 
         if api_key in ["huggingface", "finetuned"]:
-            plain_result = local_llama(prompt=cur_prompt, pipe=pipe)
+            plain_result = local_llama(client=client, prompt=cur_prompt, model=model)
         else:
+
             plain_result = cloud_llama(
+                client=client,
                 api_key=api_key,
                 model=model,
                 prompt=cur_prompt,
@@ -310,7 +235,7 @@ def collect_response_from_llama(
                 temperature=0,
                 stop=["--", "\n\n", ";", "#"],
             )
-        if isinstance(plain_result, str):
+        if type(plain_result) == str:
             sql = plain_result
         else:
             sql = "SELECT" + plain_result["choices"][0]["text"]
@@ -379,37 +304,23 @@ if __name__ == "__main__":
     args_parser.add_argument("--data_output_path", type=str)
     args = args_parser.parse_args()
 
-    if args.api_key not in ["huggingface", "finetuned"]:
-        if args.model.startswith("meta-llama/"):  # Llama model on together
+    if not args.api_key in ["huggingface", "finetuned"]:
+        os.environ["LLAMA_API_KEY"] = args.api_key
+
+        try:
+            client = LlamaAPIClient()
 
-            os.environ["TOGETHER_API_KEY"] = args.api_key
-            llm = ChatTogether(
+            response = client.chat.completions.create(
                 model=args.model,
+                messages=[{"role": "user", "content": "125*125 is?"}],
                 temperature=0,
             )
-            try:
-                response = llm.invoke("125*125 is?").content
-                print(f"{response=}")
-            except Exception as exception:
-                print(f"{exception=}")
-                exit(1)
-        else:  # Llama model on Llama API
-            os.environ["LLAMA_API_KEY"] = args.api_key
-
-            try:
-                client = LlamaAPIClient()
-
-                response = client.chat.completions.create(
-                    model=args.model,
-                    messages=[{"role": "user", "content": "125*125 is?"}],
-                    temperature=0,
-                )
-                answer = response.completion_message.content.text
+            answer = response.completion_message.content.text
 
-                print(f"{answer=}")
-            except Exception as exception:
-                print(f"{exception=}")
-                exit(1)
+            print(f"{answer=}")
+        except Exception as exception:
+            print(f"{exception=}")
+            exit(1)
 
     eval_data = json.load(open(args.eval_path, "r"))
     # '''for debug'''
@@ -422,7 +333,7 @@ if __name__ == "__main__":
     assert len(question_list) == len(db_path_list) == len(knowledge_list)
 
     if args.use_knowledge == "True":
-        responses = collect_response_from_llama(
+        responses = collect_response_from_llama(  # collect_batch_response_from_llama
             db_path_list=db_path_list,
             question_list=question_list,
             api_key=args.api_key,
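
The `huggingface`/`finetuned` branch above no longer loads the model in-process; it assumes an OpenAI-compatible server (e.g. vLLM) is already listening on `http://localhost:8000/v1`. Below is a minimal sanity check of that assumption; the launch command in the comment and the prompt are illustrative, and the served model id is whatever path or `--served-model-name` the server was started with.

```python
# Minimal sanity check for the local OpenAI-compatible endpoint assumed above.
# Start the server separately first, e.g.:  vllm serve <path-to-merged-model> --port 8000
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

served = [m.id for m in client.models.list().data]   # vLLM reports the served model id(s)
print(f"{served=}")

resp = client.chat.completions.create(
    model=served[0],
    messages=[{"role": "user", "content": "Translate to SQL: how many rows are in table t?"}],
    timeout=60,
)
print(resp.choices[0].message.content)
```

Since vLLM registers the checkpoint path as the model id by default (unless overridden with `--served-model-name`), the model name passed to llama_text2sql.py should match what the server reports here.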

+ 5 - 18
end-to-end-use-cases/coding/text2sql/eval/requirements.txt

@@ -1,19 +1,6 @@
-llama_api_client==0.1.1
-langchain-together==0.3.0
-sqlparse==0.5.3
-torch==2.4.1
-tensorboard==2.19.0
-liger-kernel==0.4.2
-setuptools==78.1.1
-deepspeed==0.15.4
-transformers==4.46.3
-datasets==3.6.0
-accelerate==1.1.1
-bitsandbytes==0.44.1
-trl==0.12.1
-peft==0.13.2
-lighteval==0.6.2
-hf-transfer==0.1.8
+llama_api_client==0.1.2
 func_timeout==4.3.5
-vllm==0.9.2
-flashinfer-python==0.2.7.post1
+
+# uncomment to run vllm for eval with Llama 3.1 8B on HF and its fine-tuned models
+# vllm==0.9.2
+# openai==1.90.0

+ 5 - 4
end-to-end-use-cases/coding/text2sql/eval/text2sql_eval.py

@@ -4,7 +4,7 @@ import multiprocessing as mp
 import sqlite3
 import sys
 
-from func_timeout import FunctionTimedOut, func_timeout
+from func_timeout import func_timeout, FunctionTimedOut
 
 
 def load_json(dir):
@@ -44,10 +44,10 @@ def execute_model(predicted_sql, ground_truth, db_place, idx, meta_time_out):
     except KeyboardInterrupt:
         sys.exit(0)
     except FunctionTimedOut:
-        result = [("timeout",)]
+        result = [(f"timeout",)]
         res = 0
     except Exception as e:
-        result = [("error",)]  # possibly len(query) > 512 or not executable
+        result = [(f"{e}",)]  # possibly len(query) > 512 or not executable
         res = 0
     result = {"sql_idx": idx, "res": res}
     return result
@@ -59,7 +59,7 @@ def package_sqls(sql_path, db_root_path, mode="gpt", data_mode="dev"):
     if mode == "gpt":
         sql_data = json.load(open(sql_path + "predict_" + data_mode + ".json", "r"))
         for idx, sql_str in sql_data.items():
-            if isinstance(sql_str, str):
+            if type(sql_str) == str:
                 sql, db_name = sql_str.split("\t----- bird -----\t")
             else:
                 sql, db_name = " ", "financial"
@@ -82,6 +82,7 @@ def package_sqls(sql_path, db_root_path, mode="gpt", data_mode="dev"):
 def run_sqls_parallel(sqls, db_places, num_cpus=1, meta_time_out=30.0):
     pool = mp.Pool(processes=num_cpus)
     for i, sql_pair in enumerate(sqls):
+
         predicted_sql, ground_truth = sql_pair
         pool.apply_async(
             execute_model,
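
For context on the exception handling changed above: each predicted/ground-truth SQL pair is executed under func_timeout, and both a timeout and any SQLite error are scored as res = 0. A minimal sketch of that guard pattern, with an in-memory database and a toy query standing in for the BIRD dev databases:

```python
# Minimal sketch of the timeout-guarded SQL execution used by execute_model above.
# The in-memory database and query are toy stand-ins for the BIRD dev databases.
import sqlite3
from func_timeout import func_timeout, FunctionTimedOut

def run_sql(db_path, sql):
    conn = sqlite3.connect(db_path)
    try:
        return conn.cursor().execute(sql).fetchall()
    finally:
        conn.close()

try:
    rows = func_timeout(30.0, run_sql, args=(":memory:", "SELECT 1"))
    print(rows)        # [(1,)]
except FunctionTimedOut:
    res = 0            # query ran past meta_time_out
except Exception as e:
    res = 0            # not executable; the updated code records str(e) in the result
```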