
add prep scripts

Sanyam Bhutani 1 month ago
parent
commit
5c24996241

+ 13 - 0
end-to-end-use-cases/data-tool/ReadMe.MD

@@ -0,0 +1,13 @@
+## WIP
+
+The end goal of this effort is to serve as a fine-tuning data preparation kit.
+
+## Current status
+
+Currently (WIP), I'm evaluating an approach for improving tool-calling datasets.
+
+Setup:
+- configs: Config prompts for creating synthetic data using `3.3`
+- data_prep/scripts: The scripts you run to prepare your datasets for annotation (see the example invocation below)
+- scripts/annotation-inference: Scripts for generating synthetic datasets -> use the vLLM script for inference
+- fine-tuning: Configs for fine-tuning (FT) with TorchTune
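+
+For example, balancing the Glaive split looks like this (taken verbatim from the header of `data-prep-nous.py` below; adjust the paths to your layout):
+
+```
+python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/glaive-function-calling-5k.json ~/task_datasets/2_Prepped_for_CoT/hermes-function-calling-v1/glaive-balanced --type glaive --target-size 500
+```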

+ 233 - 0
end-to-end-use-cases/data-tool/data_prep/scripts/data-prep-nous.py

@@ -0,0 +1,233 @@
+# python data-prep-nous.py --type singleturn ~/task_datasets/1_Downloaded/hermes-function-calling-v1/func-calling-singleturn.json ~/task_datasets/2_Prepped_for_CoT/hermes-function-calling-v1/  --second-input ~/task_datasets/1_Downloaded/hermes-function-calling-v1/json-mode-agentic.json --target-size 150
+
+# python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/glaive-function-calling-5k.json ~/task_datasets/2_Prepped_for_CoT/hermes-function-calling-v1/glaive-balanced --type glaive --target-size 500
+
+# python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/json-mode-agentic.json ~/task_datasets/2_Prepped_for_CoT/balanced-json-modeagentic --type agentic --target-size 25
+
+# python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/func-calling.json ~/task_datasets/2_Prepped_for_CoT/balanced_func_calling --type func --target-size 25
+
+import argparse
+import math
+import random
+from collections import defaultdict
+
+import pandas as pd
+from datasets import Dataset, DatasetDict, load_dataset
+
+# Category mappings
+AGENTIC_CATEGORY_MAPPING = {
+    "Simulacrum Agent": "Simulacra Agents",
+    "Simulacra Agent": "Simulacra Agents",
+    "Outlines Agents": "Outlines Agents",
+    "Outlines Agent": "Outlines Agents",
+    "Minecraft Agent": "Minecraft Agents",
+    "Voyager MineCraft Agent": "Minecraft Agents",
+    "Agent Frameworks": "Development Frameworks",
+    "Copilot Frameworks": "Development Frameworks",
+    "AI Analysis Agent": "Utility Agents",
+    "Code Analysis Agent": "Utility Agents",
+    "File Management Agent": "Utility Agents",
+    "Utility Function": "Utility Agents",
+    "WebBrowser Agent": "Utility Agents",
+    "Data Structures": "Data Processing Agents",
+    "Data Structure": "Data Processing Agents",
+    "Data Compression": "Data Processing Agents",
+    "DSPy Agents": "DSPy Agents",
+    "LLM Agents": "LLM Agents",
+    "Instructor Agents": "Instructor Agents",
+    "Autogen Agents": "Autogen Agents",
+    "LlamaIndex Agents": "LlamaIndex Agents",
+    "Langchain Agents": "Langchain Agents",
+}
+
+GLAIVE_CATEGORY_MAPPING = {
+    "Technology": "tech_computing",
+    "Programming Concepts": "tech_computing",
+    "Programming and Computer Science Questions": "tech_computing",
+    "Web Development and Design": "tech_computing",
+    "Database and SQL": "tech_computing",
+    "Swift Programming": "tech_computing",
+    "Cybersecurity and Encryption": "tech_computing",
+    "Data Science": "data_analytics",
+    "Data Analysis and Programming": "data_analytics",
+    "Machine Learning": "data_analytics",
+    "Natural Language Processing": "data_analytics",
+    "Stocks and Orders": "finance_business",
+    "Loan and Financial Calculations": "finance_business",
+    "Finance & Economics": "finance_business",
+    "Business Strategies": "finance_business",
+    "Science Education": "science_education",
+    "Science and Nature Exploration": "science_education",
+    "Quantum Physics": "science_education",
+    "Climate and Environmental Solutions": "science_education",
+    "Flight Services": "services_productivity",
+    "Location Services": "services_productivity",
+    "Productivity": "services_productivity",
+    "Request Management": "services_productivity",
+    "History and Culture": "knowledge_culture",
+    "Book Search": "knowledge_culture",
+    "Literary Analysis": "knowledge_culture",
+    "Language and Linguistics": "knowledge_culture",
+    "Language and Logic": "knowledge_culture",
+}
+
+DEFAULT_CATEGORY = "Other"
+
+
+def analyze_distribution(data, category_mapping=None):
+    category_counts = defaultdict(int)
+
+    for item in data:
+        category = item["category"]
+        if category_mapping:
+            category = category_mapping.get(category, DEFAULT_CATEGORY)
+        category_counts[category] += 1
+
+    df = pd.DataFrame(list(category_counts.items()), columns=["Category", "Count"])
+    df["Percentage"] = df["Count"] / len(data) * 100
+    return df.sort_values("Count", ascending=False)
+
+
+def balance_dataset(data, target_size=25, category_mapping=None):
+    category_groups = defaultdict(list)
+    for item in data:
+        original_category = item["category"]
+        mapped_category = original_category
+        if category_mapping:
+            mapped_category = category_mapping.get(original_category, DEFAULT_CATEGORY)
+        category_groups[mapped_category].append(item)
+
+    print("\nOriginal distribution after category mapping:")
+    for cat, items in category_groups.items():
+        print(f"{cat}: {len(items)}")
+
+    # Downsample each category to at most target_size examples ("Thanos" snap),
+    # rewriting each kept item's label to the mapped category name when a mapping is used.
+    balanced_data = []
+    for category, items in category_groups.items():
+        if len(items) > target_size:
+            selected = random.sample(items, target_size)
+        else:
+            selected = items
+
+        if category_mapping:
+            for item in selected:
+                item["category"] = category
+
+        balanced_data.extend(selected)
+
+    print(f"\nOriginal dataset size: {len(data)}")
+    print(f"Balanced dataset size: {len(balanced_data)}")
+    final_dist = analyze_distribution(balanced_data)
+    print("\nFinal distribution:")
+    print(final_dist)
+
+    return balanced_data
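+
+# Usage sketch (hypothetical toy data, not part of the CLI flow): three raw
+# categories collapse into two mapped ones, each mapped group is capped at
+# target_size, and the "category" field is rewritten to the mapped name.
+#
+#   toy = [
+#       {"category": "Simulacrum Agent", "text": "a"},
+#       {"category": "Simulacra Agent", "text": "b"},
+#       {"category": "Voyager MineCraft Agent", "text": "c"},
+#   ]
+#   balanced = balance_dataset(
+#       toy, target_size=1, category_mapping=AGENTIC_CATEGORY_MAPPING
+#   )
+#   # -> one "Simulacra Agents" item and one "Minecraft Agents" item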
+
+
+def merge_singleturn_datasets(func_path, json_path, target_per_dataset=150):
+    print("\nMerging single-turn datasets...")
+    func_single = load_dataset("json", data_files=func_path)
+    json_single = load_dataset("json", data_files=json_path)
+
+    print(f"Original func_single size: {len(func_single['train'])}")
+    print(f"Original json_single size: {len(json_single['train'])}")
+
+    def downsample_and_tag(dataset, source_name, target_total):
+        category_groups = defaultdict(list)
+        for item in dataset["train"]:
+            category_groups[item["category"]].append(item)
+
+        num_categories = len(category_groups)
+        samples_per_category = max(1, math.floor(target_total / num_categories))
+
+        print(f"\n{source_name}:")
+        print(f"Number of categories: {num_categories}")
+        print(f"Samples per category: {samples_per_category}")
+
+        balanced_data = []
+        for category, items in category_groups.items():
+            if len(items) > samples_per_category:
+                sampled_items = random.sample(items, samples_per_category)
+                balanced_data.extend(sampled_items)
+            else:
+                balanced_data.extend(items)
+
+        for item in balanced_data:
+            item["dataset_source"] = source_name
+
+        return balanced_data
+
+    func_balanced = downsample_and_tag(
+        func_single, "func_calling_singleturn", target_per_dataset
+    )
+    json_balanced = downsample_and_tag(
+        json_single, "json_mode_singleturn", target_per_dataset
+    )
+
+    merged_data = func_balanced + json_balanced
+
+    print("\nFinal merged dataset statistics:")
+    print(f"Total examples: {len(merged_data)}")
+    print(f"From func_calling_singleturn: {len(func_balanced)}")
+    print(f"From json_mode_singleturn: {len(json_balanced)}")
+
+    return merged_data
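+
+# Note on sizing (illustrative numbers only): each source dataset is capped at
+# roughly target_per_dataset examples by taking
+# floor(target_per_dataset / num_categories) samples per category. For instance,
+# with the default of 150 and 12 categories that is floor(150 / 12) = 12 samples
+# per category (~144 total), so the merged set ends up a little under
+# 2 * target_per_dataset.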
+
+
+def process_dataset(
+    input_path, output_path, dataset_type, target_size=25, second_input_path=None
+):
+    print(f"\nProcessing dataset: {input_path}")
+    print(f"Dataset type: {dataset_type}")
+
+    if dataset_type == "singleturn" and second_input_path:
+        # The merge helper already balances each source, so no further balancing is needed
+        balanced_data = merge_singleturn_datasets(
+            input_path, second_input_path, target_size
+        )
+    else:
+        dataset = load_dataset("json", data_files=input_path)
+
+        category_mapping = None
+        if dataset_type == "agentic":
+            category_mapping = AGENTIC_CATEGORY_MAPPING
+        elif dataset_type == "glaive":
+            category_mapping = GLAIVE_CATEGORY_MAPPING
+
+        balanced_data = balance_dataset(
+            dataset["train"], target_size=target_size, category_mapping=category_mapping
+        )
+    balanced_dataset = Dataset.from_list(balanced_data)
+    dataset_dict = DatasetDict({"train": balanced_dataset})
+    dataset_dict.save_to_disk(output_path)
+    print(f"\nSaved balanced dataset to {output_path}")
+
+    return dataset_dict
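+
+# Programmatic usage sketch (hypothetical paths; the CLI in main() is the
+# intended entry point):
+#
+#   process_dataset(
+#       "raw/glaive-function-calling-5k.json",
+#       "balanced/glaive",
+#       dataset_type="glaive",
+#       target_size=500,
+#   )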
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Process and balance datasets")
+    parser.add_argument("input_path", help="Path to input JSON dataset")
+    parser.add_argument("output_path", help="Path to save balanced dataset")
+    parser.add_argument(
+        "--type",
+        choices=["agentic", "func", "singleturn", "glaive"],
+        required=True,
+        help="Type of dataset to process",
+    )
+    parser.add_argument(
+        "--second-input",
+        help="Second input path (required for singleturn merge)",
+        default=None,
+    )
+    parser.add_argument(
+        "--target-size",
+        type=int,
+        default=25,
+        help="Target size per category (default: 25)",
+    )
+
+    args = parser.parse_args()
+    process_dataset(
+        args.input_path,
+        args.output_path,
+        args.type,
+        target_size=args.target_size,
+        second_input_path=args.second_input,
+    )
+
+
+if __name__ == "__main__":
+    main()

+ 59 - 0
end-to-end-use-cases/data-tool/data_prep/scripts/data-prep-toolace.py

@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
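+# Example invocation (uses the argparse defaults below):
+# python data-prep-toolace.py --dataset Team-ACE/ToolACE --output transformed_dataset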
+import argparse
+import uuid
+
+from datasets import Dataset, load_dataset
+from tqdm import tqdm
+
+
+def transform_dataset(dataset_name, output_path):
+    print(f"Loading dataset: {dataset_name}")
+    dataset = load_dataset(dataset_name)
+    print(f"Loaded dataset with {len(dataset['train'])} examples")
+
+    new_data = {"id": [], "conversations": []}
+
+    print("Transforming dataset...")
+    for example in tqdm(dataset["train"], desc="Processing examples"):
+        new_data["id"].append(str(uuid.uuid4()))
+
+        transformed_conv = [{"from": "system", "value": example["system"]}] + example[
+            "conversations"
+        ]
+
+        new_data["conversations"].append(transformed_conv)
+
+    print("Creating new dataset...")
+    new_dataset = Dataset.from_dict(new_data)
+
+    print(f"Saving dataset to: {output_path}")
+    new_dataset.save_to_disk(output_path)
+
+    print(f"Successfully transformed {len(new_dataset)} examples")
+    return new_dataset
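+
+# Shape sketch (hypothetical values): each ToolACE example carries a "system"
+# string and a "conversations" list; the transform prepends the system prompt
+# as the first turn and assigns a fresh UUID, e.g.
+#
+#   {"system": "You are ...", "conversations": [{"from": "human", "value": "..."}]}
+# becomes
+#   {"id": "<uuid4>",
+#    "conversations": [{"from": "system", "value": "You are ..."},
+#                      {"from": "human", "value": "..."}]}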
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Transform dataset to conversation format"
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="Team-ACE/ToolACE",
+        help="HuggingFace dataset name (default: Team-ACE/ToolACE)",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="transformed_dataset",
+        help="Output path for transformed dataset (default: transformed_dataset)",
+    )
+
+    args = parser.parse_args()
+
+    transform_dataset(dataset_name=args.dataset, output_path=args.output)
+
+
+if __name__ == "__main__":
+    main()