@@ -13,36 +13,7 @@ MAX_NEW_TOKENS = 10240 # If API has max tokens (vs max new tokens), we calculat
 TIMEOUT = 60  # Timeout in seconds for each API call
 
 
-def local_llama(client, prompt, model):
-    SYSTEM_PROMPT = "You are a text to SQL query translator. Using the SQLite DB Schema and the External Knowledge, translate the following text question into a SQLite SQL select statement."
-    # UNCOMMENT TO USE THE FINE_TUNED MODEL WITH REASONING DATASET
-    # SYSTEM_PROMPT = "You are a text to SQL query translator. Using the SQLite DB Schema and the External Knowledge, generate the step-by-step reasoning and the final SQLite SQL select statement from the text question."
-
-    messages = [
-        {"content": SYSTEM_PROMPT, "role": "system"},
-        {"role": "user", "content": prompt},
-    ]
-    print(f"local_llama: {model=}")
-    chat_response = client.chat.completions.create(
-        model=model,
-        messages=messages,
-        timeout=TIMEOUT,
-        temperature=0,
-    )
-    answer = chat_response.choices[0].message.content.strip()
-
-    pattern = re.compile(r"```sql\n*(.*?)```", re.DOTALL)
-    matches = pattern.findall(answer)
-    if not matches:
-        result = answer
-    else:
-        result = matches[0]
-
-    print(f"{result=}")
-    return result
-
-
-def batch_local_llama(client, prompts, model, max_workers=8):
+def local_llama(client, api_key, prompts, model, max_workers=8):
     """
     Process multiple prompts in parallel using the vllm server.
@@ -55,10 +26,19 @@ def batch_local_llama(client, prompts, model, max_workers=8):
     Returns:
         List of results in the same order as prompts
     """
+
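+    # api_key "huggingface" keeps the plain translation prompt; any other local key
+    # (e.g. "finetuned") uses the step-by-step reasoning prompt for the reasoning-dataset fine-tune.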
     SYSTEM_PROMPT = (
-        "You are a text to SQL query translator. Using the SQLite DB Schema "
-        "and the External Knowledge, translate the following text question "
-        "into a SQLite SQL select statement."
+        (
+            "You are a text to SQL query translator. Using the SQLite DB Schema "
+            "and the External Knowledge, translate the following text question "
+            "into a SQLite SQL select statement."
+        )
+        if api_key == "huggingface"
+        else (
+            "You are a text to SQL query translator. Using the SQLite DB Schema "
+            "and the External Knowledge, generate the step-by-step reasoning and "
+            "then the final SQLite SQL select statement from the text question."
+        )
     )
 
     def process_single_prompt(prompt):
@@ -88,7 +68,7 @@ def batch_local_llama(client, prompts, model, max_workers=8):
             return f"error:{e}"
 
     print(
-        f"batch_local_llama: Processing {len(prompts)} prompts with {model=} "
+        f"local_llama: Processing {len(prompts)} prompts with {model=} "
         f"using {max_workers} workers"
     )
     results = []
@@ -231,103 +211,61 @@ def generate_combined_prompts_one(db_path, question, knowledge=None):
     return combined_prompts
 
 
-def cloud_llama(client, api_key, model, prompt, max_tokens, temperature, stop):
-
-    SYSTEM_PROMPT = "You are a text to SQL query translator. Using the SQLite DB Schema and the External Knowledge, translate the following text question into a SQLite SQL select statement."
-    try:
-        messages = [
-            {"content": SYSTEM_PROMPT, "role": "system"},
-            {"role": "user", "content": prompt},
-        ]
-        final_max_tokens = len(messages) + MAX_NEW_TOKENS
-        response = client.chat.completions.create(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_completion_tokens=final_max_tokens,
-        )
-        answer = response.completion_message.content.text
-
-        pattern = re.compile(r"```sql\n*(.*?)```", re.DOTALL)
-        matches = pattern.findall(answer)
-        if matches != []:
-            result = matches[0]
-        else:
-            result = answer
-
-        print(result)
-    except Exception as e:
-        result = "error:{}".format(e)
-    print(f"{result=}")
-    return result
-
-
-def collect_response_from_llama(
-    db_path_list, question_list, api_key, model, knowledge_list=None
-):
+def cloud_llama(client, api_key, model, prompts):
     """
-    :param db_path: str
-    :param question_list: []
-    :return: dict of responses
-    """
-    response_list = []
-
-    if api_key in ["huggingface", "finetuned"]:
-        from openai import OpenAI
+    Process multiple prompts sequentially using the cloud API, showing progress with tqdm.
 
-        openai_api_key = "EMPTY"
-        openai_api_base = "http://localhost:8000/v1"
+    Args:
+        client: LlamaAPIClient
+        api_key: API key
+        model: Model name
+        prompts: List of prompts to process (or a single prompt as string)
 
-        client = OpenAI(
-            api_key=openai_api_key,
-            base_url=openai_api_base,
-        )
-    else:
-        client = LlamaAPIClient()
+    Returns:
+        List of results if prompts is a list, or a single result if prompts is a string
+    """
+    SYSTEM_PROMPT = "You are a text to SQL query translator. Using the SQLite DB Schema and the External Knowledge, translate the following text question into a SQLite SQL select statement."
 
-    for i, question in enumerate(question_list):
-        print(
-            "--------------------- processing question #{}---------------------".format(
-                i + 1
-            )
-        )
-        print("the question is: {}".format(question))
+    # Handle the case where a single prompt is passed
+    single_prompt = False
+    if isinstance(prompts, str):
+        prompts = [prompts]
+        single_prompt = True
 
-        if knowledge_list:
-            cur_prompt = generate_combined_prompts_one(
-                db_path=db_path_list[i], question=question, knowledge=knowledge_list[i]
-            )
-        else:
-            cur_prompt = generate_combined_prompts_one(
-                db_path=db_path_list[i], question=question
-            )
-
-        if api_key in ["huggingface", "finetuned"]:
-            plain_result = local_llama(client=client, prompt=cur_prompt, model=model)
-        else:
+    results = []
 
-            plain_result = cloud_llama(
-                client=client,
-                api_key=api_key,
+    # Process each prompt sequentially with tqdm progress bar
+    for prompt in tqdm(prompts, desc="Processing prompts", unit="prompt"):
+        try:
+            messages = [
+                {"content": SYSTEM_PROMPT, "role": "system"},
+                {"role": "user", "content": prompt},
+            ]
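+            # Per the MAX_NEW_TOKENS note at the top of the file, the cap passed to the
+            # API is the message count plus MAX_NEW_TOKENS.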
+            final_max_tokens = len(messages) + MAX_NEW_TOKENS
+            response = client.chat.completions.create(
                 model=model,
-                prompt=cur_prompt,
-                max_tokens=10240,
+                messages=messages,
                 temperature=0,
-                stop=["--", "\n\n", ";", "#"],
+                max_completion_tokens=final_max_tokens,
             )
-        if isinstance(plain_result, str):
-            sql = plain_result
-        else:
-            sql = "SELECT" + plain_result["choices"][0]["text"]
+            answer = response.completion_message.content.text
 
-        # responses_dict[i] = sql
-        db_id = db_path_list[i].split("/")[-1].split(".sqlite")[0]
-        sql = (
-            sql + "\t----- bird -----\t" + db_id
-        )  # to avoid unpredicted \t appearing in codex results
-        response_list.append(sql)
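+            # Keep only the SQL inside a ```sql fenced block when present; otherwise
+            # fall back to the raw model answer.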
+            pattern = re.compile(r"```sql\n*(.*?)```", re.DOTALL)
+            matches = pattern.findall(answer)
+            if matches != []:
+                result = matches[0]
+            else:
+                result = answer
+        except Exception as e:
+            result = "error:{}".format(e)
+        print(f"{result=}")
 
-    return response_list
+        results.append(result)
+
+    # Return a single result if input was a single prompt
+    if single_prompt:
+        return results[0]
+    return results
 
 
 def batch_collect_response_from_llama(
@@ -376,25 +314,24 @@ def batch_collect_response_from_llama(
     print(f"Generated {len(prompts)} prompts for batch processing")
 
     # Process prompts in parallel
-    if api_key in ["huggingface", "finetuned"]:
-        results = batch_local_llama(
-            client=client, prompts=prompts, model=model, max_workers=batch_size
+    if api_key in [
+        "huggingface",
+        "finetuned",
+    ]:  # running vllm on multiple GPUs to see best performance
+        results = local_llama(
+            client=client,
+            api_key=api_key,
+            prompts=prompts,
+            model=model,
+            max_workers=batch_size,
         )
     else:
-        # For cloud API, we could implement a batch version of cloud_llama if needed
-        # For now, just process sequentially
-        results = []
-        for prompt in prompts:
-            plain_result = cloud_llama(
-                client=client,
-                api_key=api_key,
-                model=model,
-                prompt=prompt,
-                max_tokens=10240,
-                temperature=0,
-                stop=["--", "\n\n", ";", "#"],
-            )
-            results.append(plain_result)
+        results = cloud_llama(
+            client=client,
+            api_key=api_key,
+            model=model,
+            prompts=prompts,
+        )
 
     # Format results
     response_list = []
@@ -471,9 +408,6 @@ if __name__ == "__main__":
         default=8,
         help="Number of parallel requests for batch processing",
     )
-    args_parser.add_argument(
-        "--use_batch", type=str, default="True", help="Whether to use batch processing"
-    )
     args = args_parser.parse_args()
 
     if args.api_key not in ["huggingface", "finetuned"]:
@@ -488,62 +422,36 @@ if __name__ == "__main__":
             temperature=0,
         )
         answer = response.completion_message.content.text
-
-        print(f"{answer=}")
     except Exception as exception:
         print(f"{exception=}")
         exit(1)
 
     eval_data = json.load(open(args.eval_path, "r"))
-    # '''for debug'''
-    # eval_data = eval_data[:3]
-    # '''for debug'''
 
     question_list, db_path_list, knowledge_list = decouple_question_schema(
         datasets=eval_data, db_root_path=args.db_root_path
     )
     assert len(question_list) == len(db_path_list) == len(knowledge_list)
 
-    use_batch = args.use_batch.lower() == "true"
-
-    if use_batch:
-        print(f"Using batch processing with batch_size={args.batch_size}")
-        if args.use_knowledge == "True":
-            responses = batch_collect_response_from_llama(
-                db_path_list=db_path_list,
-                question_list=question_list,
-                api_key=args.api_key,
-                model=args.model,
-                knowledge_list=knowledge_list,
-                batch_size=args.batch_size,
-            )
-        else:
-            responses = batch_collect_response_from_llama(
-                db_path_list=db_path_list,
-                question_list=question_list,
-                api_key=args.api_key,
-                model=args.model,
-                knowledge_list=None,
-                batch_size=args.batch_size,
-            )
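+    # Batch processing is now the only mode; the knowledge list is passed through
+    # only when --use_knowledge is "True".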
+    print(f"Using batch processing with batch_size={args.batch_size}")
+    if args.use_knowledge == "True":
+        responses = batch_collect_response_from_llama(
+            db_path_list=db_path_list,
+            question_list=question_list,
+            api_key=args.api_key,
+            model=args.model,
+            knowledge_list=knowledge_list,
+            batch_size=args.batch_size,
+        )
     else:
-        print("Using sequential processing")
-        if args.use_knowledge == "True":
-            responses = collect_response_from_llama(
-                db_path_list=db_path_list,
-                question_list=question_list,
-                api_key=args.api_key,
-                model=args.model,
-                knowledge_list=knowledge_list,
-            )
-        else:
-            responses = collect_response_from_llama(
-                db_path_list=db_path_list,
-                question_list=question_list,
-                api_key=args.api_key,
-                model=args.model,
-                knowledge_list=None,
-            )
+        responses = batch_collect_response_from_llama(
+            db_path_list=db_path_list,
+            question_list=question_list,
+            api_key=args.api_key,
+            model=args.model,
+            knowledge_list=None,
+            batch_size=args.batch_size,
+        )
 
     output_name = args.data_output_path + "predict_" + args.mode + ".json"