
add prep scripts

Sanyam Bhutani 1 month ago
parent
commit
5c24996241

+ 13 - 0
end-to-end-use-cases/data-tool/ReadMe.MD

@@ -0,0 +1,13 @@
+## WIP
+
+The end goal of this effort is to serve as a fine-tuning data preparation kit.
+
+## Current status
+
+Currently (WIP), I'm evaluating an approach for improving tool-calling datasets.
+
+Setup:
+- configs: Config prompts for creating synthetic data using `3.3`
+- data_prep/scripts: The scripts you run to prepare your datasets for annotation (see the example invocation below)
+- scripts/annotation-inference: Scripts for generating synthetic datasets -> use the vLLM script for inference
+- fine-tuning: Configs for fine-tuning (FT) with TorchTune
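+
+For example, balancing the Glaive split looks like this (taken verbatim from the header of `data-prep-nous.py` below; adjust the paths to your layout):
+
+```
+python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/glaive-function-calling-5k.json ~/task_datasets/2_Prepped_for_CoT/hermes-function-calling-v1/glaive-balanced --type glaive --target-size 500
+```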

+ 233 - 0
end-to-end-use-cases/data-tool/data_prep/scripts/data-prep-nous.py

@@ -0,0 +1,233 @@
+# python data-prep-nous.py --type singleturn ~/task_datasets/1_Downloaded/hermes-function-calling-v1/func-calling-singleturn.json ~/task_datasets/2_Prepped_for_CoT/hermes-function-calling-v1/  --second-input ~/task_datasets/1_Downloaded/hermes-function-calling-v1/json-mode-agentic.json --target-size 150
+
+# python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/glaive-function-calling-5k.json ~/task_datasets/2_Prepped_for_CoT/hermes-function-calling-v1/glaive-balanced --type glaive --target-size 500
+
+# python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/json-mode-agentic.json ~/task_datasets/2_Prepped_for_CoT/balanced-json-modeagentic --type agentic --target-size 25
+
+# python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/func-calling.json ~/task_datasets/2_Prepped_for_CoT/balanced_func_calling --type func --target-size 25
+
+import argparse
+import math
+import random
+from collections import defaultdict
+
+import pandas as pd
+from datasets import Dataset, DatasetDict, load_dataset
+
+# Category mappings
+AGENTIC_CATEGORY_MAPPING = {
+    "Simulacrum Agent": "Simulacra Agents",
+    "Simulacra Agent": "Simulacra Agents",
+    "Outlines Agents": "Outlines Agents",
+    "Outlines Agent": "Outlines Agents",
+    "Minecraft Agent": "Minecraft Agents",
+    "Voyager MineCraft Agent": "Minecraft Agents",
+    "Agent Frameworks": "Development Frameworks",
+    "Copilot Frameworks": "Development Frameworks",
+    "AI Analysis Agent": "Utility Agents",
+    "Code Analysis Agent": "Utility Agents",
+    "File Management Agent": "Utility Agents",
+    "Utility Function": "Utility Agents",
+    "WebBrowser Agent": "Utility Agents",
+    "Data Structures": "Data Processing Agents",
+    "Data Structure": "Data Processing Agents",
+    "Data Compression": "Data Processing Agents",
+    "DSPy Agents": "DSPy Agents",
+    "LLM Agents": "LLM Agents",
+    "Instructor Agents": "Instructor Agents",
+    "Autogen Agents": "Autogen Agents",
+    "LlamaIndex Agents": "LlamaIndex Agents",
+    "Langchain Agents": "Langchain Agents",
+}
+
+GLAIVE_CATEGORY_MAPPING = {
+    "Technology": "tech_computing",
+    "Programming Concepts": "tech_computing",
+    "Programming and Computer Science Questions": "tech_computing",
+    "Web Development and Design": "tech_computing",
+    "Database and SQL": "tech_computing",
+    "Swift Programming": "tech_computing",
+    "Cybersecurity and Encryption": "tech_computing",
+    "Data Science": "data_analytics",
+    "Data Analysis and Programming": "data_analytics",
+    "Machine Learning": "data_analytics",
+    "Natural Language Processing": "data_analytics",
+    "Stocks and Orders": "finance_business",
+    "Loan and Financial Calculations": "finance_business",
+    "Finance & Economics": "finance_business",
+    "Business Strategies": "finance_business",
+    "Science Education": "science_education",
+    "Science and Nature Exploration": "science_education",
+    "Quantum Physics": "science_education",
+    "Climate and Environmental Solutions": "science_education",
+    "Flight Services": "services_productivity",
+    "Location Services": "services_productivity",
+    "Productivity": "services_productivity",
+    "Request Management": "services_productivity",
+    "History and Culture": "knowledge_culture",
+    "Book Search": "knowledge_culture",
+    "Literary Analysis": "knowledge_culture",
+    "Language and Linguistics": "knowledge_culture",
+    "Language and Logic": "knowledge_culture",
+}
+
+DEFAULT_CATEGORY = "Other"
+
+
+def analyze_distribution(data, category_mapping=None):
+    category_counts = defaultdict(int)
+
+    for item in data:
+        category = item["category"]
+        if category_mapping:
+            category = category_mapping.get(category, DEFAULT_CATEGORY)
+        category_counts[category] += 1
+
+    df = pd.DataFrame(list(category_counts.items()), columns=["Category", "Count"])
+    df["Percentage"] = df["Count"] / len(data) * 100
+    return df.sort_values("Count", ascending=False)
+
+
+def balance_dataset(data, target_size=25, category_mapping=None):
+    category_groups = defaultdict(list)
+    for item in data:
+        original_category = item["category"]
+        mapped_category = original_category
+        if category_mapping:
+            mapped_category = category_mapping.get(original_category, DEFAULT_CATEGORY)
+        category_groups[mapped_category].append(item)
+
+    print("\nOriginal distribution after category mapping:")
+    for cat, items in category_groups.items():
+        print(f"{cat}: {len(items)}")
+
+    # Downsample each category to at most target_size examples ("Thanos" snap),
+    # rewriting each kept item's label to the mapped category name when a mapping is used.
+    balanced_data = []
+    for category, items in category_groups.items():
+        if len(items) > target_size:
+            selected = random.sample(items, target_size)
+        else:
+            selected = items
+
+        if category_mapping:
+            for item in selected:
+                item["category"] = category
+
+        balanced_data.extend(selected)
+
+    print(f"\nOriginal dataset size: {len(data)}")
+    print(f"Balanced dataset size: {len(balanced_data)}")
+    final_dist = analyze_distribution(balanced_data)
+    print("\nFinal distribution:")
+    print(final_dist)
+
+    return balanced_data
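+
+# Usage sketch (hypothetical toy data, not part of the CLI flow): three raw
+# categories collapse into two mapped ones, each mapped group is capped at
+# target_size, and the "category" field is rewritten to the mapped name.
+#
+#   toy = [
+#       {"category": "Simulacrum Agent", "text": "a"},
+#       {"category": "Simulacra Agent", "text": "b"},
+#       {"category": "Voyager MineCraft Agent", "text": "c"},
+#   ]
+#   balanced = balance_dataset(
+#       toy, target_size=1, category_mapping=AGENTIC_CATEGORY_MAPPING
+#   )
+#   # -> one "Simulacra Agents" item and one "Minecraft Agents" item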
+
+
+def merge_singleturn_datasets(func_path, json_path, target_per_dataset=150):
+    print("\nMerging single-turn datasets...")
+    func_single = load_dataset("json", data_files=func_path)
+    json_single = load_dataset("json", data_files=json_path)
+
+    print(f"Original func_single size: {len(func_single['train'])}")
+    print(f"Original json_single size: {len(json_single['train'])}")
+
+    def downsample_and_tag(dataset, source_name, target_total):
+        category_groups = defaultdict(list)
+        for item in dataset["train"]:
+            category_groups[item["category"]].append(item)
+
+        num_categories = len(category_groups)
+        samples_per_category = max(1, math.floor(target_total / num_categories))
+
+        print(f"\n{source_name}:")
+        print(f"Number of categories: {num_categories}")
+        print(f"Samples per category: {samples_per_category}")
+
+        balanced_data = []
+        for category, items in category_groups.items():
+            if len(items) > samples_per_category:
+                sampled_items = random.sample(items, samples_per_category)
+                balanced_data.extend(sampled_items)
+            else:
+                balanced_data.extend(items)
+
+        for item in balanced_data:
+            item["dataset_source"] = source_name
+
+        return balanced_data
+
+    func_balanced = downsample_and_tag(
+        func_single, "func_calling_singleturn", target_per_dataset
+    )
+    json_balanced = downsample_and_tag(
+        json_single, "json_mode_singleturn", target_per_dataset
+    )
+
+    merged_data = func_balanced + json_balanced
+
+    print("\nFinal merged dataset statistics:")
+    print(f"Total examples: {len(merged_data)}")
+    print(f"From func_calling_singleturn: {len(func_balanced)}")
+    print(f"From json_mode_singleturn: {len(json_balanced)}")
+
+    return merged_data
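+
+# Note on sizing (illustrative numbers only): each source dataset is capped at
+# roughly target_per_dataset examples by taking
+# floor(target_per_dataset / num_categories) samples per category. For instance,
+# with the default of 150 and 12 categories that is floor(150 / 12) = 12 samples
+# per category (~144 total), so the merged set ends up a little under
+# 2 * target_per_dataset.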
+
+
+def process_dataset(
+    input_path, output_path, dataset_type, target_size=25, second_input_path=None
+):
+    print(f"\nProcessing dataset: {input_path}")
+    print(f"Dataset type: {dataset_type}")
+
+    if dataset_type == "singleturn" and second_input_path:
+        # The merge helper already balances each source, so no further balancing is needed
+        balanced_data = merge_singleturn_datasets(
+            input_path, second_input_path, target_size
+        )
+    else:
+        dataset = load_dataset("json", data_files=input_path)
+
+        category_mapping = None
+        if dataset_type == "agentic":
+            category_mapping = AGENTIC_CATEGORY_MAPPING
+        elif dataset_type == "glaive":
+            category_mapping = GLAIVE_CATEGORY_MAPPING
+
+        balanced_data = balance_dataset(
+            dataset["train"], target_size=target_size, category_mapping=category_mapping
+        )
+    balanced_dataset = Dataset.from_list(balanced_data)
+    dataset_dict = DatasetDict({"train": balanced_dataset})
+    dataset_dict.save_to_disk(output_path)
+    print(f"\nSaved balanced dataset to {output_path}")
+
+    return dataset_dict
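+
+# Programmatic usage sketch (hypothetical paths; the CLI in main() is the
+# intended entry point):
+#
+#   process_dataset(
+#       "raw/glaive-function-calling-5k.json",
+#       "balanced/glaive",
+#       dataset_type="glaive",
+#       target_size=500,
+#   )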
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Process and balance datasets")
+    parser.add_argument("input_path", help="Path to input JSON dataset")
+    parser.add_argument("output_path", help="Path to save balanced dataset")
+    parser.add_argument(
+        "--type",
+        choices=["agentic", "func", "singleturn", "glaive"],
+        required=True,
+        help="Type of dataset to process",
+    )
+    parser.add_argument(
+        "--second-input",
+        help="Second input path (required for singleturn merge)",
+        default=None,
+    )
+    parser.add_argument(
+        "--target-size",
+        type=int,
+        default=25,
+        help="Target size per category (default: 25)",
+    )
+
+    args = parser.parse_args()
+    process_dataset(
+        args.input_path,
+        args.output_path,
+        args.type,
+        target_size=args.target_size,
+        second_input_path=args.second_input,
+    )
+
+
+if __name__ == "__main__":
+    main()

+ 59 - 0
end-to-end-use-cases/data-tool/data_prep/scripts/data-prep-toolace.py

@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
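+# Example invocation (uses the argparse defaults below):
+# python data-prep-toolace.py --dataset Team-ACE/ToolACE --output transformed_dataset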
+import argparse
+import uuid
+
+from datasets import Dataset, load_dataset
+from tqdm import tqdm
+
+
+def transform_dataset(dataset_name, output_path):
+    print(f"Loading dataset: {dataset_name}")
+    dataset = load_dataset(dataset_name)
+    print(f"Loaded dataset with {len(dataset['train'])} examples")
+
+    new_data = {"id": [], "conversations": []}
+
+    print("Transforming dataset...")
+    for example in tqdm(dataset["train"], desc="Processing examples"):
+        new_data["id"].append(str(uuid.uuid4()))
+
+        transformed_conv = [{"from": "system", "value": example["system"]}] + example[
+            "conversations"
+        ]
+
+        new_data["conversations"].append(transformed_conv)
+
+    print("Creating new dataset...")
+    new_dataset = Dataset.from_dict(new_data)
+
+    print(f"Saving dataset to: {output_path}")
+    new_dataset.save_to_disk(output_path)
+
+    print(f"Successfully transformed {len(new_dataset)} examples")
+    return new_dataset
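+
+# Shape sketch (hypothetical values): each ToolACE example carries a "system"
+# string and a "conversations" list; the transform prepends the system prompt
+# as the first turn and assigns a fresh UUID, e.g.
+#
+#   {"system": "You are ...", "conversations": [{"from": "human", "value": "..."}]}
+# becomes
+#   {"id": "<uuid4>",
+#    "conversations": [{"from": "system", "value": "You are ..."},
+#                      {"from": "human", "value": "..."}]}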
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Transform dataset to conversation format"
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="Team-ACE/ToolACE",
+        help="HuggingFace dataset name (default: Team-ACE/ToolACE)",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="transformed_dataset",
+        help="Output path for transformed dataset (default: transformed_dataset)",
+    )
+
+    args = parser.parse_args()
+
+    transform_dataset(dataset_name=args.dataset, output_path=args.output)
+
+
+if __name__ == "__main__":
+    main()