@@ -2,25 +2,6 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 14,
-   "id": "96a4cb34-d8da-4b16-a929-98bfae2ac668",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Updated Git hooks.\n",
-      "Git LFS initialized.\n"
-     ]
-    }
-   ],
-   "source": [
-    "!git lfs install"
-   ]
-  },
-  {
-   "cell_type": "code",
    "execution_count": 15,
    "id": "0978b512-1a88-447e-b80a-ee4b72152038",
    "metadata": {},
@@ -46,13 +27,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": null,
    "id": "ec662394-2ae9-4081-87dd-bf75c4f76500",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def analyze_distribution(data: List[Dict]) -> Tuple[Dict, Dict]:\n",
-    "    \"\"\"Analyze category and subcategory distribution.\"\"\"\n",
+    "def analyze_distribution(data):\n",
     "    category_counts = defaultdict(int)\n",
     "    subcategory_counts = defaultdict(int)\n",
     "    category_subcategory_counts = defaultdict(lambda: defaultdict(int))\n",
@@ -277,42 +257,35 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 68,
+   "execution_count": null,
    "id": "2314e5a5-3f1c-432a-8a14-ccd36c0cbae4",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Category mapping for merging\n",
     "CATEGORY_MAPPING = {\n",
-    "    # Simulacra merging\n",
     "    'Simulacrum Agent': 'Simulacra Agents',\n",
     "    'Simulacra Agent': 'Simulacra Agents',\n",
     "    \n",
-    "    # Outlines merging\n",
     "    'Outlines Agents': 'Outlines Agents',\n",
     "    'Outlines Agent': 'Outlines Agents',\n",
     "    \n",
-    "    # Minecraft merging\n",
     "    'Minecraft Agent': 'Minecraft Agents',\n",
     "    'Voyager MineCraft Agent': 'Minecraft Agents',\n",
     "    \n",
-    "    # Framework merging\n",
     "    'Agent Frameworks': 'Development Frameworks',\n",
     "    'Copilot Frameworks': 'Development Frameworks',\n",
     "    \n",
-    "    # Utility agents merging\n",
     "    'AI Analysis Agent': 'Utility Agents',\n",
     "    'Code Analysis Agent': 'Utility Agents',\n",
     "    'File Management Agent': 'Utility Agents',\n",
     "    'Utility Function': 'Utility Agents',\n",
     "    'WebBrowser Agent': 'Utility Agents',\n",
     "    \n",
-    "    # Data processing merging\n",
     "    'Data Structures': 'Data Processing Agents',\n",
     "    'Data Structure': 'Data Processing Agents',\n",
     "    'Data Compression': 'Data Processing Agents',\n",
     "    \n",
-    "    # Keep major categories as is\n",
     "    'DSPy Agents': 'DSPy Agents',\n",
     "    'LLM Agents': 'LLM Agents',\n",
     "    'Instructor Agents': 'Instructor Agents',\n",
@@ -320,22 +293,17 @@
     "    'LlamaIndex Agents': 'LlamaIndex Agents',\n",
     "    'Langchain Agents': 'Langchain Agents',\n",
     "}\n",
-    "\n",
-    "# Default category for any not explicitly mapped\n",
     "DEFAULT_CATEGORY = 'Other Agents'\n",
     "\n",
     "def balance_agentic_dataset(target_size=25):\n",
-    "    # Load dataset\n",
     "    dataset = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')\n",
     "    \n",
-    "    # Group examples by mapped categories\n",
     "    category_groups = defaultdict(list)\n",
     "    for item in dataset['train']:\n",
     "        original_category = item['category']\n",
     "        mapped_category = CATEGORY_MAPPING.get(original_category, DEFAULT_CATEGORY)\n",
     "        category_groups[mapped_category].append(item)\n",
     "    \n",
-    "    # Print original distribution after mapping\n",
     "    print(\"\\nOriginal distribution after category mapping:\")\n",
     "    for cat, items in category_groups.items():\n",
     "        print(f\"{cat}: {len(items)}\")\n",
@@ -344,7 +312,6 @@
     "    balanced_data = []\n",
     "    for category, items in category_groups.items():\n",
     "        if len(items) > target_size:\n",
-    "            # Randomly sample target_size items\n",
     "            sampled_items = random.sample(items, target_size)\n",
     "            balanced_data.extend(sampled_items)\n",
     "        else:\n",
@@ -364,23 +331,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 70,
+   "execution_count": null,
    "id": "d7ebaf2d-fb69-4257-9879-0be534a9d03f",
    "metadata": {},
    "outputs": [],
    "source": [
     "def balance_agentic_dataset(target_size=25):\n",
-    "    # Load dataset\n",
     "    dataset = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')\n",
-    "    \n",
-    "    # Group examples by mapped categories\n",
     "    category_groups = defaultdict(list)\n",
     "    for item in dataset['train']:\n",
     "        original_category = item['category']\n",
     "        mapped_category = CATEGORY_MAPPING.get(original_category, DEFAULT_CATEGORY)\n",
     "        category_groups[mapped_category].append(item)\n",
-    "    \n",
-    "    # Print original distribution after mapping\n",
+    "    \n",
     "    print(\"\\nOriginal distribution after category mapping:\")\n",
     "    for cat, items in category_groups.items():\n",
     "        print(f\"{cat}: {len(items)}\")\n",
@@ -389,14 +352,11 @@
     "    balanced_data = []\n",
     "    for category, items in category_groups.items():\n",
     "        if len(items) > target_size:\n",
-    "            # Randomly sample target_size items\n",
     "            sampled_items = random.sample(items, target_size)\n",
     "            balanced_data.extend(sampled_items)\n",
     "        else:\n",
-    "            # Keep all items if less than target_size\n",
     "            balanced_data.extend(items)\n",
     "    \n",
-    "    # Print final distribution\n",
     "    final_distribution = defaultdict(int)\n",
     "    for item in balanced_data:\n",
     "        mapped_category = CATEGORY_MAPPING.get(item['category'], DEFAULT_CATEGORY)\n",
@@ -546,21 +506,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 87,
+   "execution_count": null,
    "id": "b879509f-c8ed-42b4-9edd-b3b0056d2196",
    "metadata": {},
    "outputs": [],
    "source": [
     "def balance_func_dataset(target_size=25):\n",
-    "    # Load dataset\n",
     "    dataset = load_dataset('json', data_files='hermes-function-calling-v1/func-calling.json')\n",
     "    \n",
-    "    # Group examples by category\n",
     "    category_groups = defaultdict(list)\n",
     "    for item in dataset['train']:\n",
     "        category_groups[item['category']].append(item)\n",
     "    \n",
-    "    # Print original distribution\n",
     "    print(\"\\nOriginal distribution:\")\n",
     "    for cat, items in category_groups.items():\n",
     "        print(f\"{cat}: {len(items)}\")\n",
@@ -569,14 +526,11 @@
     "    balanced_data = []\n",
     "    for category, items in category_groups.items():\n",
     "        if len(items) > target_size:\n",
-    "            # Randomly sample target_size items\n",
     "            sampled_items = random.sample(items, target_size)\n",
     "            balanced_data.extend(sampled_items)\n",
     "        else:\n",
-    "            # Keep all items if less than target_size\n",
     "            balanced_data.extend(items)\n",
     "    \n",
-    "    # Print final distribution\n",
     "    final_distribution = defaultdict(int)\n",
     "    for item in balanced_data:\n",
     "        final_distribution[item['category']] += 1\n",
@@ -595,7 +549,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 88,
+   "execution_count": null,
    "id": "d1fc2cd0-0cdf-4e04-baa0-b90bae27a76a",
    "metadata": {},
    "outputs": [
@@ -715,20 +669,16 @@
    "source": [
     "def save_as_hf_dataset(balanced_data, output_path='balanced_func_calling'):\n",
     "    \"\"\"Save the balanced dataset as a Hugging Face dataset.\"\"\"\n",
-    "    # Convert to Dataset format\n",
     "    balanced_dataset = Dataset.from_list(balanced_data)\n",
     "    \n",
-    "    # Create DatasetDict with train split\n",
     "    dataset_dict = DatasetDict({\n",
     "        'train': balanced_dataset\n",
     "    })\n",
     "    \n",
-    "    # Save dataset\n",
     "    dataset_dict.save_to_disk(output_path)\n",
     "    print(f\"\\nSaved balanced dataset to {output_path}\")\n",
     "    return dataset_dict\n",
     "\n",
-    "# Run the balancing and save\n",
     "balanced_data = balance_func_dataset(25)\n",
     "dataset_dict = save_as_hf_dataset(balanced_data)"
    ]
@@ -743,7 +693,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 91,
+   "execution_count": null,
    "id": "a2d8eec4-97df-4abb-b248-e9236a777fd6",
    "metadata": {},
    "outputs": [
@@ -845,34 +795,29 @@
     "import math\n",
     "\n",
     "def downsample_and_tag_dataset(dataset, source_name, target_total=150):\n",
-    "    # Group examples by category\n",
     "    category_groups = defaultdict(list)\n",
     "    for item in dataset['train']:\n",
     "        category_groups[item['category']].append(item)\n",
     "    \n",
     "    num_categories = len(category_groups)\n",
-    "    # Calculate samples per category to achieve target total\n",
     "    samples_per_category = max(1, math.floor(target_total / num_categories))\n",
     "    \n",
     "    print(f\"\\n{source_name}:\")\n",
     "    print(f\"Number of categories: {num_categories}\")\n",
     "    print(f\"Samples per category: {samples_per_category}\")\n",
     "    \n",
-    "    # Balance dataset\n",
     "    balanced_data = []\n",
     "    for category, items in category_groups.items():\n",
     "        if len(items) > samples_per_category:\n",
     "            sampled_items = random.sample(items, samples_per_category)\n",
     "            balanced_data.extend(sampled_items)\n",
     "        else:\n",
-    "            # For categories with fewer examples than target, keep all\n",
     "            balanced_data.extend(items)\n",
     "    \n",
     "    # Add source tag to each example\n",
     "    for item in balanced_data:\n",
     "        item['dataset_source'] = source_name\n",
     "    \n",
-    "    # Print distribution\n",
     "    final_distribution = defaultdict(int)\n",
     "    for item in balanced_data:\n",
     "        final_distribution[item['category']] += 1\n",
@@ -887,34 +832,28 @@
     "    return balanced_data\n",
     "\n",
     "def merge_and_save_datasets(target_per_dataset=150):\n",
-    "    # Load datasets\n",
     "    func_single = load_dataset('json', data_files='hermes-function-calling-v1/func-calling-singleturn.json')\n",
     "    json_single = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-singleturn.json')\n",
     "    \n",
-    "    # Print original sizes\n",
     "    print(f\"Original func_single size: {len(func_single['train'])}\")\n",
     "    print(f\"Original json_single size: {len(json_single['train'])}\")\n",
     "    \n",
-    "    # Downsample and tag each dataset\n",
     "    func_balanced = downsample_and_tag_dataset(func_single, 'func_calling_singleturn', target_per_dataset)\n",
     "    json_balanced = downsample_and_tag_dataset(json_single, 'json_mode_singleturn', target_per_dataset)\n",
     "    \n",
     "    # Merge datasets\n",
     "    merged_data = func_balanced + json_balanced\n",
     "    \n",
-    "    # Create and save merged dataset\n",
     "    merged_dataset = Dataset.from_list(merged_data)\n",
     "    dataset_dict = DatasetDict({\n",
     "        'train': merged_dataset\n",
     "    })\n",
-    "    \n",
-    "    # Print final statistics\n",
+    "\n",
     "    print(\"\\nFinal merged dataset statistics:\")\n",
     "    print(f\"Total examples: {len(merged_data)}\")\n",
     "    print(f\"From func_calling_singleturn: {len(func_balanced)}\")\n",
     "    print(f\"From json_mode_singleturn: {len(json_balanced)}\")\n",
     "    \n",
-    "    # Save dataset\n",
     "    output_path = 'balanced_singleturn_merged'\n",
     "    dataset_dict.save_to_disk(output_path)\n",
     "    print(f\"\\nSaved merged dataset to {output_path}\")\n",