# Usage examples:
#   python data-prep-nous.py --type singleturn ~/task_datasets/1_Downloaded/hermes-function-calling-v1/func-calling-singleturn.json ~/task_datasets/2_Prepped_for_CoT/hermes-function-calling-v1/ --second-input ~/task_datasets/1_Downloaded/hermes-function-calling-v1/json-mode-agentic.json --target-size 150
#   python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/glaive-function-calling-5k.json ~/task_datasets/2_Prepped_for_CoT/hermes-function-calling-v1/glaive-balanced --type glaive --target-size 500
#   python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/json-mode-agentic.json ~/task_datasets/2_Prepped_for_CoT/balanced-json-modeagentic --type agentic --target-size 25
#   python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/func-calling.json ~/task_datasets/2_Prepped_for_CoT/balanced_func_calling --type func --target-size 25
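"""Prepare and balance Hermes function-calling datasets by category.

Loads a JSON dataset, optionally remaps fine-grained categories into
broader groups, downsamples each category to a target size, and saves
the result as a Hugging Face DatasetDict.
"""
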
import argparse
import math
import random
from collections import defaultdict

import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset

# Category mappings
AGENTIC_CATEGORY_MAPPING = {
    "Simulacrum Agent": "Simulacra Agents",
    "Simulacra Agent": "Simulacra Agents",
    "Outlines Agents": "Outlines Agents",
    "Outlines Agent": "Outlines Agents",
    "Minecraft Agent": "Minecraft Agents",
    "Voyager MineCraft Agent": "Minecraft Agents",
    "Agent Frameworks": "Development Frameworks",
    "Copilot Frameworks": "Development Frameworks",
    "AI Analysis Agent": "Utility Agents",
    "Code Analysis Agent": "Utility Agents",
    "File Management Agent": "Utility Agents",
    "Utility Function": "Utility Agents",
    "WebBrowser Agent": "Utility Agents",
    "Data Structures": "Data Processing Agents",
    "Data Structure": "Data Processing Agents",
    "Data Compression": "Data Processing Agents",
    "DSPy Agents": "DSPy Agents",
    "LLM Agents": "LLM Agents",
    "Instructor Agents": "Instructor Agents",
    "Autogen Agents": "Autogen Agents",
    "LlamaIndex Agents": "LlamaIndex Agents",
    "Langchain Agents": "Langchain Agents",
}

GLAIVE_CATEGORY_MAPPING = {
    "Technology": "tech_computing",
    "Programming Concepts": "tech_computing",
    "Programming and Computer Science Questions": "tech_computing",
    "Web Development and Design": "tech_computing",
    "Database and SQL": "tech_computing",
    "Swift Programming": "tech_computing",
    "Cybersecurity and Encryption": "tech_computing",
    "Data Science": "data_analytics",
    "Data Analysis and Programming": "data_analytics",
    "Machine Learning": "data_analytics",
    "Natural Language Processing": "data_analytics",
    "Stocks and Orders": "finance_business",
    "Loan and Financial Calculations": "finance_business",
    "Finance & Economics": "finance_business",
    "Business Strategies": "finance_business",
    "Science Education": "science_education",
    "Science and Nature Exploration": "science_education",
    "Quantum Physics": "science_education",
    "Climate and Environmental Solutions": "science_education",
    "Flight Services": "services_productivity",
    "Location Services": "services_productivity",
    "Productivity": "services_productivity",
    "Request Management": "services_productivity",
    "History and Culture": "knowledge_culture",
    "Book Search": "knowledge_culture",
    "Literary Analysis": "knowledge_culture",
    "Language and Linguistics": "knowledge_culture",
    "Language and Logic": "knowledge_culture",
}

DEFAULT_CATEGORY = "Other"


def analyze_distribution(data, category_mapping=None):
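    """Count examples per (optionally remapped) category and return a
    DataFrame sorted by count, with each category's percentage share."""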
    category_counts = defaultdict(int)
    for item in data:
        category = item["category"]
        if category_mapping:
            category = category_mapping.get(category, DEFAULT_CATEGORY)
        category_counts[category] += 1
    df = pd.DataFrame(list(category_counts.items()), columns=["Category", "Count"])
    df["Percentage"] = df["Count"] / len(data) * 100
    return df.sort_values("Count", ascending=False)


def balance_dataset(data, target_size=25, category_mapping=None):
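    """Group examples by (mapped) category, downsample each group to at
    most target_size examples, and return the balanced list of items."""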
    category_groups = defaultdict(list)
    for item in data:
        original_category = item["category"]
        mapped_category = original_category
        if category_mapping:
            mapped_category = category_mapping.get(original_category, DEFAULT_CATEGORY)
        category_groups[mapped_category].append(item)

    print("\nOriginal distribution after category mapping:")
    for cat, items in category_groups.items():
        print(f"{cat}: {len(items)}")

    # Downsample each category to at most target_size examples.
    balanced_data = []
    for category, items in category_groups.items():
        if len(items) > target_size:
            selected = random.sample(items, target_size)
        else:
            selected = items
        if category_mapping:
            # Relabel the selected items with their mapped category so the
            # saved dataset carries the broader category names.
            for item in selected:
                item["category"] = category
        balanced_data.extend(selected)

    print(f"\nOriginal dataset size: {len(data)}")
    print(f"Balanced dataset size: {len(balanced_data)}")
    final_dist = analyze_distribution(balanced_data)
    print("\nFinal distribution:")
    print(final_dist)
    return balanced_data


def merge_singleturn_datasets(func_path, json_path, target_per_dataset=150):
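    """Load the two single-turn JSON datasets, balance each to roughly
    target_per_dataset examples split across its categories, tag each item
    with its source dataset, and return the merged list."""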
- print("\nMerging single-turn datasets...")
- func_single = load_dataset("json", data_files=func_path)
- json_single = load_dataset("json", data_files=json_path)
- print(f"Original func_single size: {len(func_single['train'])}")
- print(f"Original json_single size: {len(json_single['train'])}")
- def downsample_and_tag(dataset, source_name, target_total):
- category_groups = defaultdict(list)
- for item in dataset["train"]:
- category_groups[item["category"]].append(item)
- num_categories = len(category_groups)
- samples_per_category = max(1, math.floor(target_total / num_categories))
- print(f"\n{source_name}:")
- print(f"Number of categories: {num_categories}")
- print(f"Samples per category: {samples_per_category}")
- balanced_data = []
- for category, items in category_groups.items():
- if len(items) > samples_per_category:
- sampled_items = random.sample(items, samples_per_category)
- balanced_data.extend(sampled_items)
- else:
- balanced_data.extend(items)
- for item in balanced_data:
- item["dataset_source"] = source_name
- return balanced_data
- func_balanced = downsample_and_tag(
- func_single, "func_calling_singleturn", target_per_dataset
- )
- json_balanced = downsample_and_tag(
- json_single, "json_mode_singleturn", target_per_dataset
- )
- merged_data = func_balanced + json_balanced
- print("\nFinal merged dataset statistics:")
- print(f"Total examples: {len(merged_data)}")
- print(f"From func_calling_singleturn: {len(func_balanced)}")
- print(f"From json_mode_singleturn: {len(json_balanced)}")
- return merged_data


def process_dataset(
    input_path, output_path, dataset_type, target_size=25, second_input_path=None
):
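    """Dispatch on dataset_type: merge the two single-turn datasets, or
    balance a single dataset with the appropriate category mapping, then
    save the result to output_path as a DatasetDict."""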
- print(f"\nProcessing dataset: {input_path}")
- print(f"Dataset type: {dataset_type}")
- if dataset_type == "singleturn" and second_input_path:
- data = merge_singleturn_datasets(input_path, second_input_path, target_size)
- balanced_data = data # Done earlier
- else:
- dataset = load_dataset("json", data_files=input_path)
- category_mapping = None
- if dataset_type == "agentic":
- category_mapping = AGENTIC_CATEGORY_MAPPING
- elif dataset_type == "glaive":
- category_mapping = GLAIVE_CATEGORY_MAPPING
- balanced_data = balance_dataset(
- dataset["train"], target_size=target_size, category_mapping=category_mapping
- )
- balanced_dataset = Dataset.from_list(balanced_data)
- dataset_dict = DatasetDict({"train": balanced_dataset})
- dataset_dict.save_to_disk(output_path)
- print(f"\nSaved balanced dataset to {output_path}")
- return dataset_dict


def main():
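    """Parse command-line arguments and run the balancing pipeline."""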
    parser = argparse.ArgumentParser(description="Process and balance datasets")
    parser.add_argument("input_path", help="Path to input JSON dataset")
    parser.add_argument("output_path", help="Path to save balanced dataset")
    parser.add_argument(
        "--type",
        choices=["agentic", "func", "singleturn", "glaive"],
        required=True,
        help="Type of dataset to process",
    )
    parser.add_argument(
        "--second-input",
        help="Second input path (required for singleturn merge)",
        default=None,
    )
    parser.add_argument(
        "--target-size",
        type=int,
        default=25,
        help="Target size per category (default: 25)",
    )
    args = parser.parse_args()
    # Forward --second-input so the singleturn merge path actually receives it.
    process_dataset(
        args.input_path,
        args.output_path,
        args.type,
        target_size=args.target_size,
        second_input_path=args.second_input,
    )


if __name__ == "__main__":
    main()