| 
					
				 | 
			
			
				@@ -2,25 +2,6 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  "cells": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   "execution_count": 14, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   "id": "96a4cb34-d8da-4b16-a929-98bfae2ac668", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   "outputs": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-     "name": "stdout", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-     "output_type": "stream", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-     "text": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-      "Updated Git hooks.\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-      "Git LFS initialized.\n" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-     ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   ], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "!git lfs install" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   ] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-  }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-  { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "execution_count": 15, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "0978b512-1a88-447e-b80a-ee4b72152038", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "metadata": {}, 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -46,13 +27,12 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   "execution_count": 63, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "execution_count": null, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "ec662394-2ae9-4081-87dd-bf75c4f76500", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "outputs": [], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "def analyze_distribution(data: List[Dict]) -> Tuple[Dict, Dict]:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    \"\"\"Analyze category and subcategory distribution.\"\"\"\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "def analyze_distribution(data):\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    category_counts = defaultdict(int)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    subcategory_counts = defaultdict(int)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    category_subcategory_counts = defaultdict(lambda: defaultdict(int))\n", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -277,42 +257,35 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   "execution_count": 68, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "execution_count": null, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "2314e5a5-3f1c-432a-8a14-ccd36c0cbae4", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "outputs": [], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "# Category mapping for merging\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "CATEGORY_MAPPING = {\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Simulacra merging\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Simulacrum Agent': 'Simulacra Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Simulacra Agent': 'Simulacra Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Outlines merging\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Outlines Agents': 'Outlines Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Outlines Agent': 'Outlines Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Minecraft merging\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Minecraft Agent': 'Minecraft Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Voyager MineCraft Agent': 'Minecraft Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Framework merging\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Agent Frameworks': 'Development Frameworks',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Copilot Frameworks': 'Development Frameworks',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Utility agents merging\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'AI Analysis Agent': 'Utility Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Code Analysis Agent': 'Utility Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'File Management Agent': 'Utility Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Utility Function': 'Utility Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'WebBrowser Agent': 'Utility Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Data processing merging\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Data Structures': 'Data Processing Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Data Structure': 'Data Processing Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Data Compression': 'Data Processing Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Keep major categories as is\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'DSPy Agents': 'DSPy Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'LLM Agents': 'LLM Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Instructor Agents': 'Instructor Agents',\n", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -320,22 +293,17 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'LlamaIndex Agents': 'LlamaIndex Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    'Langchain Agents': 'Langchain Agents',\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "}\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "# Default category for any not explicitly mapped\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "DEFAULT_CATEGORY = 'Other Agents'\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "def balance_agentic_dataset(target_size=25):\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Load dataset\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    dataset = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Group examples by mapped categories\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    category_groups = defaultdict(list)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for item in dataset['train']:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        original_category = item['category']\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        mapped_category = CATEGORY_MAPPING.get(original_category, DEFAULT_CATEGORY)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        category_groups[mapped_category].append(item)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Print original distribution after mapping\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(\"\\nOriginal distribution after category mapping:\")\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for cat, items in category_groups.items():\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        print(f\"{cat}: {len(items)}\")\n", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -344,7 +312,6 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    balanced_data = []\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for category, items in category_groups.items():\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        if len(items) > target_size:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "            # Randomly sample target_size items\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "            sampled_items = random.sample(items, target_size)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "            balanced_data.extend(sampled_items)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        else:\n", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -364,23 +331,19 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   "execution_count": 70, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "execution_count": null, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "d7ebaf2d-fb69-4257-9879-0be534a9d03f", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "outputs": [], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "def balance_agentic_dataset(target_size=25):\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Load dataset\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    dataset = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Group examples by mapped categories\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    category_groups = defaultdict(list)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for item in dataset['train']:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        original_category = item['category']\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        mapped_category = CATEGORY_MAPPING.get(original_category, DEFAULT_CATEGORY)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        category_groups[mapped_category].append(item)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Print original distribution after mapping\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "        \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(\"\\nOriginal distribution after category mapping:\")\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for cat, items in category_groups.items():\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        print(f\"{cat}: {len(items)}\")\n", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -389,14 +352,11 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    balanced_data = []\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for category, items in category_groups.items():\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        if len(items) > target_size:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "            # Randomly sample target_size items\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "            sampled_items = random.sample(items, target_size)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "            balanced_data.extend(sampled_items)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        else:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "            # Keep all items if less than target_size\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "            balanced_data.extend(items)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Print final distribution\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    final_distribution = defaultdict(int)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for item in balanced_data:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        mapped_category = CATEGORY_MAPPING.get(item['category'], DEFAULT_CATEGORY)\n", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -546,21 +506,18 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   "execution_count": 87, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "execution_count": null, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "b879509f-c8ed-42b4-9edd-b3b0056d2196", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "outputs": [], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "def balance_func_dataset(target_size=25):\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Load dataset\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    dataset = load_dataset('json', data_files='hermes-function-calling-v1/func-calling.json')\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Group examples by category\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    category_groups = defaultdict(list)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for item in dataset['train']:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        category_groups[item['category']].append(item)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Print original distribution\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(\"\\nOriginal distribution:\")\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for cat, items in category_groups.items():\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        print(f\"{cat}: {len(items)}\")\n", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -569,14 +526,11 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    balanced_data = []\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for category, items in category_groups.items():\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        if len(items) > target_size:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "            # Randomly sample target_size items\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "            sampled_items = random.sample(items, target_size)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "            balanced_data.extend(sampled_items)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        else:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "            # Keep all items if less than target_size\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "            balanced_data.extend(items)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Print final distribution\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    final_distribution = defaultdict(int)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for item in balanced_data:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        final_distribution[item['category']] += 1\n", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -595,7 +549,7 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   "execution_count": 88, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "execution_count": null, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "d1fc2cd0-0cdf-4e04-baa0-b90bae27a76a", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "outputs": [ 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -715,20 +669,16 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "source": [ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "def save_as_hf_dataset(balanced_data, output_path='balanced_func_calling'):\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \"\"\"Save the balanced dataset as a Hugging Face dataset.\"\"\"\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Convert to Dataset format\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    balanced_dataset = Dataset.from_list(balanced_data)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Create DatasetDict with train split\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    dataset_dict = DatasetDict({\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        'train': balanced_dataset\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    })\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Save dataset\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    dataset_dict.save_to_disk(output_path)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(f\"\\nSaved balanced dataset to {output_path}\")\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    return dataset_dict\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "# Run the balancing and save\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "balanced_data = balance_func_dataset(25)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "dataset_dict = save_as_hf_dataset(balanced_data)" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    ] 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -743,7 +693,7 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   }, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				   { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "cell_type": "code", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-   "execution_count": 91, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+   "execution_count": null, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "id": "a2d8eec4-97df-4abb-b248-e9236a777fd6", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "metadata": {}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				    "outputs": [ 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -845,34 +795,29 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "import math\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "def downsample_and_tag_dataset(dataset, source_name, target_total=150):\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Group examples by category\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    category_groups = defaultdict(list)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for item in dataset['train']:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        category_groups[item['category']].append(item)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    num_categories = len(category_groups)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Calculate samples per category to achieve target total\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    samples_per_category = max(1, math.floor(target_total / num_categories))\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(f\"\\n{source_name}:\")\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(f\"Number of categories: {num_categories}\")\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(f\"Samples per category: {samples_per_category}\")\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Balance dataset\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    balanced_data = []\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for category, items in category_groups.items():\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        if len(items) > samples_per_category:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "            sampled_items = random.sample(items, samples_per_category)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "            balanced_data.extend(sampled_items)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        else:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "            # For categories with fewer examples than target, keep all\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "            balanced_data.extend(items)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    # Add source tag to each example\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for item in balanced_data:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        item['dataset_source'] = source_name\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Print distribution\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    final_distribution = defaultdict(int)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    for item in balanced_data:\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        final_distribution[item['category']] += 1\n", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -887,34 +832,28 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    return balanced_data\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "def merge_and_save_datasets(target_per_dataset=150):\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Load datasets\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    func_single = load_dataset('json', data_files='hermes-function-calling-v1/func-calling-singleturn.json')\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    json_single = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-singleturn.json')\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Print original sizes\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(f\"Original func_single size: {len(func_single['train'])}\")\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(f\"Original json_single size: {len(json_single['train'])}\")\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Downsample and tag each dataset\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    func_balanced = downsample_and_tag_dataset(func_single, 'func_calling_singleturn', target_per_dataset)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    json_balanced = downsample_and_tag_dataset(json_single, 'json_mode_singleturn', target_per_dataset)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    # Merge datasets\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    merged_data = func_balanced + json_balanced\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Create and save merged dataset\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    merged_dataset = Dataset.from_list(merged_data)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    dataset_dict = DatasetDict({\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "        'train': merged_dataset\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    })\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Print final statistics\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(\"\\nFinal merged dataset statistics:\")\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(f\"Total examples: {len(merged_data)}\")\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(f\"From func_calling_singleturn: {len(func_balanced)}\")\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(f\"From json_mode_singleturn: {len(json_balanced)}\")\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    "    # Save dataset\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    output_path = 'balanced_singleturn_merged'\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    dataset_dict.save_to_disk(output_path)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     "    print(f\"\\nSaved merged dataset to {output_path}\")\n", 
			 |