
Batch processing and parallel vLLM Llama calls; cleaner progress display in the 2-step eval

Jeff Tang committed 4 days ago
commit 77d3544c81

+ 18 - 7
end-to-end-use-cases/coding/text2sql/eval/llama_eval.sh

@@ -1,10 +1,13 @@
+# Set to "true" to enable debug mode with detailed prints
+DEBUG_MODE="false"
+
 eval_path='../data/dev_20240627/dev.json'
 db_root_path='../data/dev_20240627/dev_databases/'
 ground_truth_path='../data/'
 
 # Llama models on Llama API
-YOUR_API_KEY='YOUR_LLAMA_API_KEY'
-model='Llama-3.3-8B-Instruct'
+# YOUR_API_KEY='YOUR_LLAMA_API_KEY'
+# model='Llama-3.3-8B-Instruct'
 #model='Llama-3.3-70B-Instruct'
 #model='Llama-4-Maverick-17B-128E-Instruct-FP8'
 #model='Llama-4-Scout-17B-16E-Instruct-FP8'
@@ -14,8 +17,8 @@ model='Llama-3.3-8B-Instruct'
 # model='meta-llama/Llama-3.1-8B-Instruct'
 
 # Fine-tuned Llama models locally
-# YOUR_API_KEY='finetuned'
-# model='../fine-tuning/final_test/llama31-8b-text2sql-peft-quantized-cot_merged'
+YOUR_API_KEY='finetuned'
+model='../fine-tuning/llama31-8b-text2sql-fft-nonquantized-cot-epochs-3'
 
 data_output_path="./output/$model/"
 
@@ -26,9 +29,17 @@ python3 -u llama_text2sql.py --db_root_path ${db_root_path} --api_key ${YOUR_API
 # Check if llama_text2sql.py exited successfully
 if [ $? -eq 0 ]; then
     echo "llama_text2sql.py completed successfully. Proceeding with evaluation..."
-    python3 -u text2sql_eval.py --db_root_path ${db_root_path} --predicted_sql_path ${data_output_path} \
-    --ground_truth_path ${ground_truth_path} \
-    --diff_json_path ${eval_path}
+
+    # Add --debug flag if DEBUG_MODE is true
+    if [ "$DEBUG_MODE" = "true" ]; then
+        python3 -u text2sql_eval.py --db_root_path ${db_root_path} --predicted_sql_path ${data_output_path} \
+        --ground_truth_path ${ground_truth_path} \
+        --diff_json_path ${eval_path} --debug
+    else
+        python3 -u text2sql_eval.py --db_root_path ${db_root_path} --predicted_sql_path ${data_output_path} \
+        --ground_truth_path ${ground_truth_path} \
+        --diff_json_path ${eval_path}
+    fi
 
     echo "Done evaluating $model."
 

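The script above now points the two-step eval at a locally served fine-tuned model, so it assumes the OpenAI-compatible server that llama_text2sql.py targets (http://localhost:8000/v1, see the next file) is already running. A quick sanity check before kicking off the eval might look like the sketch below; it is not part of this commit and assumes a vLLM-style server exposing the standard /v1/models route:

    # sanity_check_server.py -- illustrative only, not part of this commit
    from openai import OpenAI

    # Same endpoint and placeholder key that llama_text2sql.py uses when
    # --api_key is "huggingface" or "finetuned" (assumed to be a local vLLM
    # OpenAI-compatible server).
    client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

    # The eval's $model value must match one of the model IDs the server serves.
    for m in client.models.list().data:
        print(m.id)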
+ 216 - 20
end-to-end-use-cases/coding/text2sql/eval/llama_text2sql.py

@@ -1,4 +1,5 @@
 import argparse
+import concurrent.futures
 import json
 import os
 import re
@@ -6,6 +7,7 @@ import sqlite3
 from typing import Dict
 
 from llama_api_client import LlamaAPIClient
+from tqdm import tqdm
 
 MAX_NEW_TOKENS = 10240  # If API has max tokens (vs max new tokens), we calculate it
 TIMEOUT = 60  # Timeout in seconds for each API call
@@ -25,20 +27,98 @@ def local_llama(client, prompt, model):
         model=model,
         messages=messages,
         timeout=TIMEOUT,
+        temperature=0,
     )
     answer = chat_response.choices[0].message.content.strip()
 
     pattern = re.compile(r"```sql\n*(.*?)```", re.DOTALL)
     matches = pattern.findall(answer)
-    if matches != []:
-        result = matches[0]
-    else:
+    if not matches:
         result = answer
+    else:
+        result = matches[0]
 
     print(f"{result=}")
     return result
 
 
+def batch_local_llama(client, prompts, model, max_workers=8):
+    """
+    Process multiple prompts in parallel using the vllm server.
+
+    Args:
+        client: OpenAI client
+        prompts: List of prompts to process
+        model: Model name
+        max_workers: Maximum number of parallel workers
+
+    Returns:
+        List of results in the same order as prompts
+    """
+    SYSTEM_PROMPT = (
+        "You are a text to SQL query translator. Using the SQLite DB Schema "
+        "and the External Knowledge, translate the following text question "
+        "into a SQLite SQL select statement."
+    )
+
+    def process_single_prompt(prompt):
+        messages = [
+            {"content": SYSTEM_PROMPT, "role": "system"},
+            {"role": "user", "content": prompt},
+        ]
+        try:
+            chat_response = client.chat.completions.create(
+                model=model,
+                messages=messages,
+                timeout=TIMEOUT,
+                temperature=0,
+            )
+            answer = chat_response.choices[0].message.content.strip()
+
+            pattern = re.compile(r"```sql\n*(.*?)```", re.DOTALL)
+            matches = pattern.findall(answer)
+            if not matches:
+                result = answer
+            else:
+                result = matches[0]
+
+            return result
+        except Exception as e:
+            print(f"Error processing prompt: {e}")
+            return f"error:{e}"
+
+    print(
+        f"batch_local_llama: Processing {len(prompts)} prompts with {model=} "
+        f"using {max_workers} workers"
+    )
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Submit all tasks and create a map of futures to their indices
+        future_to_index = {
+            executor.submit(process_single_prompt, prompt): i
+            for i, prompt in enumerate(prompts)
+        }
+
+        # Initialize results list with None values
+        results = [None] * len(prompts)
+
+        # Process completed futures as they complete
+        for future in tqdm(
+            concurrent.futures.as_completed(future_to_index),
+            total=len(prompts),
+            desc="Processing prompts",
+        ):
+            index = future_to_index[future]
+            try:
+                results[index] = future.result()
+            except Exception as e:
+                print(f"Error processing prompt at index {index}: {e}")
+                results[index] = f"error:{e}"
+
+    return results
+
+
 def new_directory(path):
     if not os.path.exists(path):
         os.makedirs(path)
@@ -235,7 +315,7 @@ def collect_response_from_llama(
                 temperature=0,
                 stop=["--", "\n\n", ";", "#"],
             )
-        if type(plain_result) == str:
+        if isinstance(plain_result, str):
             sql = plain_result
         else:
             sql = "SELECT" + plain_result["choices"][0]["text"]
@@ -250,6 +330,89 @@ def collect_response_from_llama(
     return response_list
 
 
+def batch_collect_response_from_llama(
+    db_path_list, question_list, api_key, model, knowledge_list=None, batch_size=8
+):
+    """
+    Process multiple questions in parallel using the vllm server.
+
+    Args:
+        db_path_list: List of database paths
+        question_list: List of questions
+        api_key: API key
+        model: Model name
+        knowledge_list: List of knowledge strings (optional)
+        batch_size: Number of parallel requests
+
+    Returns:
+        List of SQL responses
+    """
+    if api_key in ["huggingface", "finetuned"]:
+        from openai import OpenAI
+
+        openai_api_key = "EMPTY"
+        openai_api_base = "http://localhost:8000/v1"
+
+        client = OpenAI(
+            api_key=openai_api_key,
+            base_url=openai_api_base,
+        )
+    else:
+        client = LlamaAPIClient()
+
+    # Generate all prompts first
+    prompts = []
+    for i, question in enumerate(question_list):
+        if knowledge_list:
+            cur_prompt = generate_combined_prompts_one(
+                db_path=db_path_list[i], question=question, knowledge=knowledge_list[i]
+            )
+        else:
+            cur_prompt = generate_combined_prompts_one(
+                db_path=db_path_list[i], question=question
+            )
+        prompts.append(cur_prompt)
+
+    print(f"Generated {len(prompts)} prompts for batch processing")
+
+    # Process prompts in parallel
+    if api_key in ["huggingface", "finetuned"]:
+        results = batch_local_llama(
+            client=client, prompts=prompts, model=model, max_workers=batch_size
+        )
+    else:
+        # For cloud API, we could implement a batch version of cloud_llama if needed
+        # For now, just process sequentially
+        results = []
+        for prompt in prompts:
+            plain_result = cloud_llama(
+                client=client,
+                api_key=api_key,
+                model=model,
+                prompt=prompt,
+                max_tokens=10240,
+                temperature=0,
+                stop=["--", "\n\n", ";", "#"],
+            )
+            results.append(plain_result)
+
+    # Format results
+    response_list = []
+    for i, result in enumerate(results):
+        if isinstance(result, str):
+            sql = result
+        else:
+            sql = "SELECT" + result["choices"][0]["text"]
+
+        db_id = db_path_list[i].split("/")[-1].split(".sqlite")[0]
+        sql = (
+            sql + "\t----- bird -----\t" + db_id
+        )  # to avoid unpredicted \t appearing in codex results
+        response_list.append(sql)
+
+    return response_list
+
+
 def question_package(data_json, knowledge=False):
     question_list = []
     for data in data_json:
@@ -302,9 +465,18 @@ if __name__ == "__main__":
     args_parser.add_argument("--api_key", type=str, required=True)
     args_parser.add_argument("--model", type=str, required=True)
     args_parser.add_argument("--data_output_path", type=str)
+    args_parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=8,
+        help="Number of parallel requests for batch processing",
+    )
+    args_parser.add_argument(
+        "--use_batch", type=str, default="True", help="Whether to use batch processing"
+    )
     args = args_parser.parse_args()
 
-    if not args.api_key in ["huggingface", "finetuned"]:
+    if args.api_key not in ["huggingface", "finetuned"]:
         os.environ["LLAMA_API_KEY"] = args.api_key
 
         try:
@@ -332,22 +504,46 @@ if __name__ == "__main__":
     )
     assert len(question_list) == len(db_path_list) == len(knowledge_list)
 
-    if args.use_knowledge == "True":
-        responses = collect_response_from_llama(  # collect_batch_response_from_llama
-            db_path_list=db_path_list,
-            question_list=question_list,
-            api_key=args.api_key,
-            model=args.model,
-            knowledge_list=knowledge_list,
-        )
+    use_batch = args.use_batch.lower() == "true"
+
+    if use_batch:
+        print(f"Using batch processing with batch_size={args.batch_size}")
+        if args.use_knowledge == "True":
+            responses = batch_collect_response_from_llama(
+                db_path_list=db_path_list,
+                question_list=question_list,
+                api_key=args.api_key,
+                model=args.model,
+                knowledge_list=knowledge_list,
+                batch_size=args.batch_size,
+            )
+        else:
+            responses = batch_collect_response_from_llama(
+                db_path_list=db_path_list,
+                question_list=question_list,
+                api_key=args.api_key,
+                model=args.model,
+                knowledge_list=None,
+                batch_size=args.batch_size,
+            )
     else:
-        responses = collect_response_from_llama(
-            db_path_list=db_path_list,
-            question_list=question_list,
-            api_key=args.api_key,
-            model=args.model,
-            knowledge_list=None,
-        )
+        print("Using sequential processing")
+        if args.use_knowledge == "True":
+            responses = collect_response_from_llama(
+                db_path_list=db_path_list,
+                question_list=question_list,
+                api_key=args.api_key,
+                model=args.model,
+                knowledge_list=knowledge_list,
+            )
+        else:
+            responses = collect_response_from_llama(
+                db_path_list=db_path_list,
+                question_list=question_list,
+                api_key=args.api_key,
+                model=args.model,
+                knowledge_list=None,
+            )
 
     output_name = args.data_output_path + "predict_" + args.mode + ".json"
 
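The new batch path in batch_local_llama fans prompts out over a thread pool and reassembles results by submission index, since as_completed yields futures in completion order rather than submission order. A self-contained sketch of that pattern, with a toy worker standing in for the chat.completions.create call (all names below are illustrative, not from the commit):

    import concurrent.futures
    import time

    from tqdm import tqdm


    def toy_worker(x):
        # Stand-in for an API call; sleeps so that completion order
        # differs from submission order.
        time.sleep(0.01 * (10 - x % 10))
        return x * x


    inputs = list(range(32))
    results = [None] * len(inputs)

    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        # Map each future back to the index of the input it was built from.
        future_to_index = {executor.submit(toy_worker, x): i for i, x in enumerate(inputs)}
        for future in tqdm(
            concurrent.futures.as_completed(future_to_index), total=len(inputs)
        ):
            results[future_to_index[future]] = future.result()

    # Order is preserved despite out-of-order completion.
    assert results == [x * x for x in inputs]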

+ 64 - 19
end-to-end-use-cases/coding/text2sql/eval/text2sql_eval.py

@@ -5,6 +5,7 @@ import sqlite3
 import sys
 
 from func_timeout import func_timeout, FunctionTimedOut
+from tqdm import tqdm
 
 
 def load_json(dir):
@@ -17,7 +18,7 @@ def result_callback(result):
     exec_result.append(result)
 
 
-def execute_sql(predicted_sql, ground_truth, db_path):
+def execute_sql(predicted_sql, ground_truth, db_path, debug=False):
     conn = sqlite3.connect(db_path)
     # Connect to the database
     cursor = conn.cursor()
@@ -28,7 +29,7 @@ def execute_sql(predicted_sql, ground_truth, db_path):
     res = 0
     if set(predicted_res) == set(ground_truth_res):
         res = 1
-    else:
+    elif debug:
         print(
             f"\n\n==== INCORRECT SQL GENERATED ====\n{predicted_sql=}\n{predicted_res=}\n{ground_truth=}\n{ground_truth_res=}\n======\n\n"
         )
@@ -36,10 +37,14 @@ def execute_sql(predicted_sql, ground_truth, db_path):
     return res
 
 
-def execute_model(predicted_sql, ground_truth, db_place, idx, meta_time_out):
+def execute_model(
+    predicted_sql, ground_truth, db_place, idx, meta_time_out, debug=False
+):
     try:
         res = func_timeout(
-            meta_time_out, execute_sql, args=(predicted_sql, ground_truth, db_place)
+            meta_time_out,
+            execute_sql,
+            args=(predicted_sql, ground_truth, db_place, debug),
         )
     except KeyboardInterrupt:
         sys.exit(0)
@@ -79,19 +84,35 @@ def package_sqls(sql_path, db_root_path, mode="gpt", data_mode="dev"):
     return clean_sqls, db_path_list
 
 
-def run_sqls_parallel(sqls, db_places, num_cpus=1, meta_time_out=30.0):
+def run_sqls_parallel(sqls, db_places, num_cpus=1, meta_time_out=30.0, debug=False):
     pool = mp.Pool(processes=num_cpus)
-    for i, sql_pair in enumerate(sqls):
 
+    # Create the progress bar up front (None in debug mode) so the callback
+    # below never references an undefined name
+    pbar = tqdm(total=len(sqls), desc="Evaluating SQL queries") if not debug else None
+
+    for i, sql_pair in enumerate(sqls):
         predicted_sql, ground_truth = sql_pair
         pool.apply_async(
             execute_model,
-            args=(predicted_sql, ground_truth, db_places[i], i, meta_time_out),
-            callback=result_callback,
+            args=(predicted_sql, ground_truth, db_places[i], i, meta_time_out, debug),
+            callback=lambda result: result_callback_with_progress(
+                result, not debug, pbar
+            ),
         )
     pool.close()
     pool.join()
 
+    # Close the progress bar if not in debug mode
+    if not debug:
+        pbar.close()
+
+
+def result_callback_with_progress(result, use_progress, pbar=None):
+    exec_result.append(result)
+    if use_progress and pbar:
+        pbar.update(1)
+
 
 def sort_results(list_of_dicts):
     return sorted(list_of_dicts, key=lambda x: x["sql_idx"])
@@ -137,14 +158,19 @@ def compute_acc_by_diff(exec_results, diff_json_path):
     )
 
 
-def print_data(score_lists, count_lists):
+def print_data(score_lists, count_lists, debug=False):
     levels = ["simple", "moderate", "challenging", "total"]
-    print("{:20} {:20} {:20} {:20} {:20}".format("", *levels))
-    print("{:20} {:<20} {:<20} {:<20} {:<20}".format("count", *count_lists))
 
-    print(
-        "======================================    ACCURACY    ====================================="
-    )
+    if debug:
+        print("{:20} {:20} {:20} {:20} {:20}".format("", *levels))
+        print("{:20} {:<20} {:<20} {:<20} {:<20}".format("count", *count_lists))
+        print(
+            "======================================    ACCURACY    ====================================="
+        )
+    else:
+        print("\nEvaluation Results:")
+        print("-" * 40)
+
     print(
         "{:20} {:<20.2f} {:<20.2f} {:<20.2f} {:<20.2f}".format("accuracy", *score_lists)
     )
@@ -164,9 +190,19 @@ if __name__ == "__main__":
     args_parser.add_argument("--mode_predict", type=str, default="gpt")
     args_parser.add_argument("--difficulty", type=str, default="simple")
     args_parser.add_argument("--diff_json_path", type=str, default="")
+    args_parser.add_argument(
+        "--debug", action="store_true", help="Enable debug mode with detailed prints"
+    )
     args = args_parser.parse_args()
     exec_result = []
 
+    if args.debug:
+        print("Debug mode enabled - showing detailed output")
+
+    # Show loading progress if not in debug mode
+    if not args.debug:
+        print("Loading SQL queries and database paths...")
+
     pred_queries, db_paths = package_sqls(
         args.predicted_sql_path,
         args.db_root_path,
@@ -179,20 +215,29 @@ if __name__ == "__main__":
     )
 
     query_pairs = list(zip(pred_queries, gt_queries))
+
+    if args.debug:
+        print(f"Executing {len(query_pairs)} SQL query pairs...")
+
     run_sqls_parallel(
         query_pairs,
         db_places=db_paths,
         num_cpus=args.num_cpus,
         meta_time_out=args.meta_time_out,
+        debug=args.debug,
     )
     exec_result = sort_results(exec_result)
 
-    print("Evaluating statistics...")
+    if args.debug:
+        print("Evaluating statistics...")
+
     simple_acc, moderate_acc, challenging_acc, acc, count_lists = compute_acc_by_diff(
         exec_result, args.diff_json_path
     )
     score_lists = [simple_acc, moderate_acc, challenging_acc, acc]
-    print_data(score_lists, count_lists)
-    print(
-        "==========================================================================================="
-    )
+    print_data(score_lists, count_lists, debug=args.debug)
+
+    if args.debug:
+        print(
+            "==========================================================================================="
+        )