# python data-prep-nous.py --type singleturn ~/task_datasets/1_Downloaded/hermes-function-calling-v1/func-calling-singleturn.json ~/task_datasets/2_Prepped_for_CoT/hermes-function-calling-v1/ --second-input ~/task_datasets/1_Downloaded/hermes-function-calling-v1/json-mode-agentic.json --target-size 150
# python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/glaive-function-calling-5k.json ~/task_datasets/2_Prepped_for_CoT/hermes-function-calling-v1/glaive-balanced --type glaive --target-size 500
# python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/json-mode-agentic.json ~/task_datasets/2_Prepped_for_CoT/balanced-json-modeagentic --type agentic --target-size 25
# python data-prep-nous.py ~/task_datasets/1_Downloaded/hermes-function-calling-v1/func-calling.json ~/task_datasets/2_Prepped_for_CoT/balanced_func_calling --type func --target-size 25

import argparse
import math
import random
from collections import defaultdict

import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
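
# Sampling below relies on random.sample, so output varies between runs.
# Seeding (an optional addition, not in the original script) would make the
# balancing reproducible:
# random.seed(42)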

# Category mappings
AGENTIC_CATEGORY_MAPPING = {
    "Simulacrum Agent": "Simulacra Agents",
    "Simulacra Agent": "Simulacra Agents",
    "Outlines Agents": "Outlines Agents",
    "Outlines Agent": "Outlines Agents",
    "Minecraft Agent": "Minecraft Agents",
    "Voyager MineCraft Agent": "Minecraft Agents",
    "Agent Frameworks": "Development Frameworks",
    "Copilot Frameworks": "Development Frameworks",
    "AI Analysis Agent": "Utility Agents",
    "Code Analysis Agent": "Utility Agents",
    "File Management Agent": "Utility Agents",
    "Utility Function": "Utility Agents",
    "WebBrowser Agent": "Utility Agents",
    "Data Structures": "Data Processing Agents",
    "Data Structure": "Data Processing Agents",
    "Data Compression": "Data Processing Agents",
    "DSPy Agents": "DSPy Agents",
    "LLM Agents": "LLM Agents",
    "Instructor Agents": "Instructor Agents",
    "Autogen Agents": "Autogen Agents",
    "LlamaIndex Agents": "LlamaIndex Agents",
    "Langchain Agents": "Langchain Agents",
}
GLAIVE_CATEGORY_MAPPING = {
    "Technology": "tech_computing",
    "Programming Concepts": "tech_computing",
    "Programming and Computer Science Questions": "tech_computing",
    "Web Development and Design": "tech_computing",
    "Database and SQL": "tech_computing",
    "Swift Programming": "tech_computing",
    "Cybersecurity and Encryption": "tech_computing",
    "Data Science": "data_analytics",
    "Data Analysis and Programming": "data_analytics",
    "Machine Learning": "data_analytics",
    "Natural Language Processing": "data_analytics",
    "Stocks and Orders": "finance_business",
    "Loan and Financial Calculations": "finance_business",
    "Finance & Economics": "finance_business",
    "Business Strategies": "finance_business",
    "Science Education": "science_education",
    "Science and Nature Exploration": "science_education",
    "Quantum Physics": "science_education",
    "Climate and Environmental Solutions": "science_education",
    "Flight Services": "services_productivity",
    "Location Services": "services_productivity",
    "Productivity": "services_productivity",
    "Request Management": "services_productivity",
    "History and Culture": "knowledge_culture",
    "Book Search": "knowledge_culture",
    "Literary Analysis": "knowledge_culture",
    "Language and Linguistics": "knowledge_culture",
    "Language and Logic": "knowledge_culture",
}

DEFAULT_CATEGORY = "Other"


def analyze_distribution(data, category_mapping=None):
    """Summarize per-category counts and percentages, sorted by count."""
    category_counts = defaultdict(int)
    for item in data:
        category = item["category"]
        if category_mapping:
            category = category_mapping.get(category, DEFAULT_CATEGORY)
        category_counts[category] += 1
    df = pd.DataFrame(list(category_counts.items()), columns=["Category", "Count"])
    df["Percentage"] = df["Count"] / len(data) * 100
    return df.sort_values("Count", ascending=False)


def balance_dataset(data, target_size=25, category_mapping=None):
    """Cap every (optionally remapped) category at target_size examples."""
    category_groups = defaultdict(list)
    for item in data:
        original_category = item["category"]
        mapped_category = original_category
        if category_mapping:
            mapped_category = category_mapping.get(original_category, DEFAULT_CATEGORY)
        category_groups[mapped_category].append(item)
    print("\nOriginal distribution after category mapping:")
    for cat, items in category_groups.items():
        print(f"{cat}: {len(items)}")
    # Thanos: snap oversized categories down to target_size.
    balanced_data = []
    for category, items in category_groups.items():
        if len(items) > target_size:
            kept = random.sample(items, target_size)
        else:
            kept = items
        if category_mapping:
            # Relabel exactly the items kept for this category with the
            # mapped category name.
            for item in kept:
                item["category"] = category
        balanced_data.extend(kept)
    print(f"\nOriginal dataset size: {len(data)}")
    print(f"Balanced dataset size: {len(balanced_data)}")
    final_dist = analyze_distribution(balanced_data)
    print("\nFinal distribution:")
    print(final_dist)
    return balanced_data
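
# Quick sanity sketch (hypothetical data, not part of the script):
#   sample = [{"category": "Tech"} for _ in range(40)] + \
#            [{"category": "Math"} for _ in range(10)]
#   balanced = balance_dataset(sample, target_size=25)
#   # -> 25 randomly sampled "Tech" items plus all 10 "Math" items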


def merge_singleturn_datasets(func_path, json_path, target_per_dataset=150):
    """Balance two single-turn datasets to roughly target_per_dataset each,
    tag every example with its source, and concatenate them."""
    print("\nMerging single-turn datasets...")
    func_single = load_dataset("json", data_files=func_path)
    json_single = load_dataset("json", data_files=json_path)
    print(f"Original func_single size: {len(func_single['train'])}")
    print(f"Original json_single size: {len(json_single['train'])}")

    def downsample_and_tag(dataset, source_name, target_total):
        # Spread target_total evenly across categories, then sample each one.
        category_groups = defaultdict(list)
        for item in dataset["train"]:
            category_groups[item["category"]].append(item)
        num_categories = len(category_groups)
        samples_per_category = max(1, math.floor(target_total / num_categories))
        print(f"\n{source_name}:")
        print(f"Number of categories: {num_categories}")
        print(f"Samples per category: {samples_per_category}")
        balanced_data = []
        for category, items in category_groups.items():
            if len(items) > samples_per_category:
                sampled_items = random.sample(items, samples_per_category)
                balanced_data.extend(sampled_items)
            else:
                balanced_data.extend(items)
        for item in balanced_data:
            item["dataset_source"] = source_name
        return balanced_data

    func_balanced = downsample_and_tag(
        func_single, "func_calling_singleturn", target_per_dataset
    )
    json_balanced = downsample_and_tag(
        json_single, "json_mode_singleturn", target_per_dataset
    )
    merged_data = func_balanced + json_balanced
    print("\nFinal merged dataset statistics:")
    print(f"Total examples: {len(merged_data)}")
    print(f"From func_calling_singleturn: {len(func_balanced)}")
    print(f"From json_mode_singleturn: {len(json_balanced)}")
    return merged_data
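
# Note the differing semantics: target_per_dataset above is an approximate
# total per source (split across its categories), whereas balance_dataset's
# target_size is a per-category cap.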


def process_dataset(
    input_path, output_path, dataset_type, target_size=25, second_input_path=None
):
    """Load a JSON dataset, balance it according to its type, and save it."""
    print(f"\nProcessing dataset: {input_path}")
    print(f"Dataset type: {dataset_type}")
    if dataset_type == "singleturn" and second_input_path:
        # The merge balances both inputs itself, so no further balancing here.
        balanced_data = merge_singleturn_datasets(
            input_path, second_input_path, target_size
        )
    else:
        dataset = load_dataset("json", data_files=input_path)
        category_mapping = None
        if dataset_type == "agentic":
            category_mapping = AGENTIC_CATEGORY_MAPPING
        elif dataset_type == "glaive":
            category_mapping = GLAIVE_CATEGORY_MAPPING
        balanced_data = balance_dataset(
            dataset["train"], target_size=target_size, category_mapping=category_mapping
        )
    balanced_dataset = Dataset.from_list(balanced_data)
    dataset_dict = DatasetDict({"train": balanced_dataset})
    dataset_dict.save_to_disk(output_path)
    print(f"\nSaved balanced dataset to {output_path}")
    return dataset_dict


def main():
    parser = argparse.ArgumentParser(description="Process and balance datasets")
    parser.add_argument("input_path", help="Path to input JSON dataset")
    parser.add_argument("output_path", help="Path to save balanced dataset")
    parser.add_argument(
        "--type",
        choices=["agentic", "func", "singleturn", "glaive"],
        required=True,
        help="Type of dataset to process",
    )
    parser.add_argument(
        "--second-input",
        help="Second input path (required for singleturn merge)",
        default=None,
    )
    parser.add_argument(
        "--target-size",
        type=int,
        default=25,
        help="Target size per category (default: 25)",
    )
    args = parser.parse_args()
    # Forward --second-input as well; without it the singleturn merge never runs.
    process_dataset(
        args.input_path,
        args.output_path,
        args.type,
        args.target_size,
        args.second_input,
    )


if __name__ == "__main__":
    main()
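
# To consume the saved output elsewhere (a sketch; the path is illustrative):
#   from datasets import load_from_disk
#   ds = load_from_disk("path/to/balanced_func_calling")
#   print(ds["train"][0])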