# python data-prep-nous.py --type singleturn ~/task_datasets/1_Downloaded/hermes-function-calling-v1/func-calling-singleturn.json ~/task_datasets/2_Prepped_for_CoT/hermes-function-calling-v1/ --second-input ~/task_datasets/1_Downloaded/hermes-function-calling-v1/json-mode-agentic.json --target-size 150

# python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/glaive-function-calling-5k.json ~/task_datasets/2_Prepped_for_CoT/hermes-function-calling-v1/glaive-balanced --type glaive --target-size 500

# python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/json-mode-agentic.json ~/task_datasets/2_Prepped_for_CoT/balanced-json-modeagentic --type agentic --target-size 25

# python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/func-calling.json ~/task_datasets/2_Prepped_for_CoT/balanced_func_calling --type func --target-size 25

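"""Prep script for the Hermes function-calling datasets: groups examples by
category (optionally collapsing categories via a mapping), downsamples each
group to a target size, and saves the result as a Hugging Face dataset.
Example invocations are listed in the comments above."""
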
import argparse
import math
import random
from collections import defaultdict

import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset

# Category mappings: collapse fine-grained source categories into broader
# buckets before balancing
AGENTIC_CATEGORY_MAPPING = {
    "Simulacrum Agent": "Simulacra Agents",
    "Simulacra Agent": "Simulacra Agents",
    "Outlines Agents": "Outlines Agents",
    "Outlines Agent": "Outlines Agents",
    "Minecraft Agent": "Minecraft Agents",
    "Voyager MineCraft Agent": "Minecraft Agents",
    "Agent Frameworks": "Development Frameworks",
    "Copilot Frameworks": "Development Frameworks",
    "AI Analysis Agent": "Utility Agents",
    "Code Analysis Agent": "Utility Agents",
    "File Management Agent": "Utility Agents",
    "Utility Function": "Utility Agents",
    "WebBrowser Agent": "Utility Agents",
    "Data Structures": "Data Processing Agents",
    "Data Structure": "Data Processing Agents",
    "Data Compression": "Data Processing Agents",
    "DSPy Agents": "DSPy Agents",
    "LLM Agents": "LLM Agents",
    "Instructor Agents": "Instructor Agents",
    "Autogen Agents": "Autogen Agents",
    "LlamaIndex Agents": "LlamaIndex Agents",
    "Langchain Agents": "Langchain Agents",
}

GLAIVE_CATEGORY_MAPPING = {
    "Technology": "tech_computing",
    "Programming Concepts": "tech_computing",
    "Programming and Computer Science Questions": "tech_computing",
    "Web Development and Design": "tech_computing",
    "Database and SQL": "tech_computing",
    "Swift Programming": "tech_computing",
    "Cybersecurity and Encryption": "tech_computing",
    "Data Science": "data_analytics",
    "Data Analysis and Programming": "data_analytics",
    "Machine Learning": "data_analytics",
    "Natural Language Processing": "data_analytics",
    "Stocks and Orders": "finance_business",
    "Loan and Financial Calculations": "finance_business",
    "Finance & Economics": "finance_business",
    "Business Strategies": "finance_business",
    "Science Education": "science_education",
    "Science and Nature Exploration": "science_education",
    "Quantum Physics": "science_education",
    "Climate and Environmental Solutions": "science_education",
    "Flight Services": "services_productivity",
    "Location Services": "services_productivity",
    "Productivity": "services_productivity",
    "Request Management": "services_productivity",
    "History and Culture": "knowledge_culture",
    "Book Search": "knowledge_culture",
    "Literary Analysis": "knowledge_culture",
    "Language and Linguistics": "knowledge_culture",
    "Language and Logic": "knowledge_culture",
}

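# Fallback bucket for source categories that no mapping covers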
DEFAULT_CATEGORY = "Other"


def analyze_distribution(data, category_mapping=None):
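    """Tabulate category counts and percentages, applying the optional
    category mapping before counting."""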
    category_counts = defaultdict(int)

    for item in data:
        category = item["category"]
        if category_mapping:
            category = category_mapping.get(category, DEFAULT_CATEGORY)
        category_counts[category] += 1

    df = pd.DataFrame(list(category_counts.items()), columns=["Category", "Count"])
    df["Percentage"] = df["Count"] / len(data) * 100
    return df.sort_values("Count", ascending=False)


def balance_dataset(data, target_size=25, category_mapping=None):
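    """Group examples by (optionally mapped) category and randomly downsample
    any group larger than target_size."""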
    category_groups = defaultdict(list)
    for item in data:
        original_category = item["category"]
        mapped_category = original_category
        if category_mapping:
            mapped_category = category_mapping.get(original_category, DEFAULT_CATEGORY)
        category_groups[mapped_category].append(item)

    print("\nOriginal distribution after category mapping:")
    for cat, items in category_groups.items():
        print(f"{cat}: {len(items)}")

    # Thanos snap: keep at most target_size examples per category
    balanced_data = []
    for category, items in category_groups.items():
        if len(items) > target_size:
            selected = random.sample(items, target_size)
        else:
            selected = items

        # Relabel the kept items with the mapped category name so the saved
        # dataset reflects the merged buckets
        if category_mapping:
            for item in selected:
                item["category"] = category

        balanced_data.extend(selected)

    print(f"\nOriginal dataset size: {len(data)}")
    print(f"Balanced dataset size: {len(balanced_data)}")
    final_dist = analyze_distribution(balanced_data)
    print("\nFinal distribution:")
    print(final_dist)

    return balanced_data


def merge_singleturn_datasets(func_path, json_path, target_per_dataset=150):
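    """Balance two single-turn JSON datasets to roughly target_per_dataset
    examples each, tag every item with its source, and concatenate them."""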
    print("\nMerging single-turn datasets...")
    func_single = load_dataset("json", data_files=func_path)
    json_single = load_dataset("json", data_files=json_path)

    print(f"Original func_single size: {len(func_single['train'])}")
    print(f"Original json_single size: {len(json_single['train'])}")

    def downsample_and_tag(dataset, source_name, target_total):
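        # Split target_total evenly across categories (at least one example
        # each), sample that many per category, and tag items with the source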
        category_groups = defaultdict(list)
        for item in dataset["train"]:
            category_groups[item["category"]].append(item)

        num_categories = len(category_groups)
        samples_per_category = max(1, math.floor(target_total / num_categories))

        print(f"\n{source_name}:")
        print(f"Number of categories: {num_categories}")
        print(f"Samples per category: {samples_per_category}")

        balanced_data = []
        for category, items in category_groups.items():
            if len(items) > samples_per_category:
                sampled_items = random.sample(items, samples_per_category)
                balanced_data.extend(sampled_items)
            else:
                balanced_data.extend(items)

        for item in balanced_data:
            item["dataset_source"] = source_name

        return balanced_data

    func_balanced = downsample_and_tag(
        func_single, "func_calling_singleturn", target_per_dataset
    )
    json_balanced = downsample_and_tag(
        json_single, "json_mode_singleturn", target_per_dataset
    )

    merged_data = func_balanced + json_balanced

    print("\nFinal merged dataset statistics:")
    print(f"Total examples: {len(merged_data)}")
    print(f"From func_calling_singleturn: {len(func_balanced)}")
    print(f"From json_mode_singleturn: {len(json_balanced)}")

    return merged_data


def process_dataset(
    input_path, output_path, dataset_type, target_size=25, second_input_path=None
):
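    """Balance one dataset (or merge two single-turn datasets) and save the
    result to output_path as a DatasetDict."""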
    print(f"\nProcessing dataset: {input_path}")
    print(f"Dataset type: {dataset_type}")

    if dataset_type == "singleturn" and second_input_path:
        # merge_singleturn_datasets already balances per category
        balanced_data = merge_singleturn_datasets(
            input_path, second_input_path, target_size
        )
    else:
        dataset = load_dataset("json", data_files=input_path)

        category_mapping = None
        if dataset_type == "agentic":
            category_mapping = AGENTIC_CATEGORY_MAPPING
        elif dataset_type == "glaive":
            category_mapping = GLAIVE_CATEGORY_MAPPING

        balanced_data = balance_dataset(
            dataset["train"], target_size=target_size, category_mapping=category_mapping
        )

    balanced_dataset = Dataset.from_list(balanced_data)
    dataset_dict = DatasetDict({"train": balanced_dataset})
    dataset_dict.save_to_disk(output_path)
    print(f"\nSaved balanced dataset to {output_path}")

    return dataset_dict


def main():
    parser = argparse.ArgumentParser(description="Process and balance datasets")
    parser.add_argument("input_path", help="Path to input JSON dataset")
    parser.add_argument("output_path", help="Path to save balanced dataset")
    parser.add_argument(
        "--type",
        choices=["agentic", "func", "singleturn", "glaive"],
        required=True,
        help="Type of dataset to process",
    )
    parser.add_argument(
        "--second-input",
        help="Second input path (required for singleturn merge)",
        default=None,
    )
    parser.add_argument(
        "--target-size",
        type=int,
        default=25,
        help="Target size per category (default: 25)",
    )

    args = parser.parse_args()
    process_dataset(
        args.input_path,
        args.output_path,
        args.type,
        args.target_size,
        args.second_input,
    )


if __name__ == "__main__":
    main()