{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"id": "0978b512-1a88-447e-b80a-ee4b72152038",
"metadata": {},
"outputs": [],
"source": [
"# NOTE: each `!` command runs in its own subshell, so a bare `!cd` on its own\n",
"# line does not persist to the next line; chain the commands so the LFS pull\n",
"# actually runs inside the repo directory.\n",
"!cd hermes-function-calling-v1 && git lfs pull"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "acb50ef7-fa53-438d-bce7-1eebf7bc0ba5",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset, Dataset, DatasetDict\n",
"from collections import defaultdict\n",
"import pandas as pd\n",
"import random\n",
"import datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec662394-2ae9-4081-87dd-bf75c4f76500",
"metadata": {},
"outputs": [],
"source": [
"def analyze_distribution(data):\n",
"    \"\"\"Tally items by 'category', 'subcategory', and (category, subcategory).\n",
"\n",
"    Each element of `data` must be a mapping with 'category' and 'subcategory'\n",
"    keys. Returns a 3-tuple of plain dicts:\n",
"    (category_counts, subcategory_counts, category_subcategory_counts).\n",
"    NOTE: the values of the third dict are still inner defaultdicts after the\n",
"    outer dict() conversion (dict() is a shallow copy).\n",
"    \"\"\"\n",
"    category_counts = defaultdict(int)\n",
"    subcategory_counts = defaultdict(int)\n",
"    category_subcategory_counts = defaultdict(lambda: defaultdict(int))\n",
"    \n",
"    for item in data:\n",
"        category = item['category']\n",
"        subcategory = item['subcategory']\n",
"        \n",
"        category_counts[category] += 1\n",
"        subcategory_counts[subcategory] += 1\n",
"        category_subcategory_counts[category][subcategory] += 1\n",
"    \n",
"    return dict(category_counts), dict(subcategory_counts), dict(category_subcategory_counts)\n"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "63ff9ec7-f68a-4344-acf4-e99efbdd4e7c",
"metadata": {},
"outputs": [],
"source": [
"# Load each raw split of Hermes function-calling v1 from the local clone.\n",
"# Paths are relative to the notebook; the JSON files are fetched by the\n",
"# `git lfs pull` cell above.\n",
"agentic = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')\n",
"singleturn = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-singleturn.json')\n",
"func_calling = load_dataset('json', data_files='hermes-function-calling-v1/func-calling.json')\n",
"func_single = load_dataset('json', data_files='hermes-function-calling-v1/func-calling-singleturn.json')\n",
"glaive_ds = load_dataset('json', data_files='hermes-function-calling-v1/glaive-function-calling-5k.json')"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "b4ded1b5-bc3c-4719-9b46-0a57f7169a6f",
"metadata": {},
"outputs": [],
"source": [
"def get_distributions(dataset, name):\n",
"    \"\"\"Print the category distribution of dataset['train'] and return raw counts.\n",
"\n",
"    Prints a Count/Percentage table sorted by frequency under a `name` header\n",
"    and returns a plain dict mapping category name -> number of examples.\n",
"    \"\"\"\n",
"    counts = defaultdict(int)\n",
"    for example in dataset['train']:\n",
"        counts[example['category']] += 1\n",
"\n",
"    print(f\"\\n=== {name} Distribution ===\")\n",
"    table = pd.DataFrame(list(counts.items()), columns=['Category', 'Count'])\n",
"    table['Percentage'] = table['Count'] / table['Count'].sum() * 100\n",
"    print(table.sort_values('Count', ascending=False))\n",
"    return dict(counts)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "cf2e0f61-2090-4691-a1c1-00d015e5fab6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Analyzing individual datasets:\n",
"\n",
"=== Agentic Distribution ===\n",
" Category Count Percentage\n",
"24 DSPy Agents 457 34.053651\n",
"26 LLM Agents 93 6.929955\n",
"8 Simulacrum Agent 71 5.290611\n",
"39 Instructor Agents 68 5.067064\n",
"23 Agent Frameworks 64 4.769001\n",
"11 JSON Schema 59 4.396423\n",
"0 Simulacra Agent 59 4.396423\n",
"10 Copilot Frameworks 47 3.502235\n",
"37 Autogen Agents 45 3.353204\n",
"41 LlamaIndex Agents 44 3.278689\n",
"33 Langchain Agents 40 2.980626\n",
"1 Embodied Agent 38 2.831595\n",
"29 Voyager MineCraft Agent 36 2.682563\n",
"32 Copilot Agents 31 2.309985\n",
"35 Guardrails Agent 28 2.086438\n",
"18 WebBrowser Agent 26 1.937407\n",
"31 FuncChain Agents 23 1.713860\n",
"30 AutoGPT Agent 21 1.564829\n",
"16 Minecraft Agent 18 1.341282\n",
"15 LangGraph Agents 15 1.117735\n",
"25 Outlines Agents 13 0.968703\n",
"19 Outlines Agent 13 0.968703\n",
"12 Coding Agents 5 0.372578\n",
"21 Guidance Agents 5 0.372578\n",
"13 Gollie Agents 4 0.298063\n",
"27 CrewAI Agent 3 0.223547\n",
"9 AI-powered Task Management System 1 0.074516\n",
"7 AI Analysis Agent 1 0.074516\n",
"6 Image Generation Agent 1 0.074516\n",
"5 Utility Function 1 0.074516\n",
"3 Data Structures 1 0.074516\n",
"4 Data Compression 1 0.074516\n",
"2 Data Structure 1 0.074516\n",
"17 Music Agent 1 0.074516\n",
"22 System 1 0.074516\n",
"20 CrewAI Integration 1 0.074516\n",
"14 BabyAGI Classic 1 0.074516\n",
"28 Autonomous Agent 1 0.074516\n",
"36 CrewAI Agent Customization 1 0.074516\n",
"34 Code Analysis Agent 1 0.074516\n",
"38 File Management Agent 1 0.074516\n",
"40 Multion API Interaction 1 0.074516\n",
"\n",
"=== Singleturn Distribution ===\n",
" Category Count Percentage\n",
"3 Materials 147 11.845286\n",
"18 Financial Services (New Name) 93 7.493956\n",
"22 Consumer Discretionary Distribution & Retail (... 92 7.413376\n",
"1 Capital Goods 88 7.091056\n",
"2 Transportation 86 6.929895\n",
"5 Consumer Durables & Apparel 79 6.365834\n",
"6 Commercial & Professional Services 75 6.043513\n",
"7 Energy 67 5.398872\n",
"21 Health Care Equipment & Services 63 5.076551\n",
"15 Consumer Services 52 4.190169\n",
"10 Food, Beverage & Tobacco 52 4.190169\n",
"17 Technology Hardware & Equipment 47 3.787268\n",
"4 Software & Services 43 3.464948\n",
"20 Insurance 40 3.223207\n",
"16 Consumer Staples Distribution & Retail (New Name) 33 2.659146\n",
"13 Automobiles & Components 32 2.578566\n",
"0 Pharmaceuticals, Biotechnology & Life Sciences 26 2.095085\n",
"8 Media & Entertainment 25 2.014504\n",
"11 Telecommunication Services 25 2.014504\n",
"19 Banks 25 2.014504\n",
"9 JSON Schema 19 1.531023\n",
"12 Semiconductors & Semiconductor Equipment 17 1.369863\n",
"14 Household & Personal Products 15 1.208703\n",
"\n",
"=== Function Calling Distribution ===\n",
" Category Count Percentage\n",
"9 Information Extraction 796 42.049657\n",
"47 API Call 119 6.286318\n",
"40 Industrial Software 65 3.433703\n",
"24 Utilities Software 52 2.746962\n",
"38 Robotic Process Automation (RPA) 51 2.694136\n",
".. ... ... ...\n",
"59 SAP 2 0.105652\n",
"18 Document Ranking 1 0.052826\n",
"20 Marketing 1 0.052826\n",
"45 Business Intelligence 1 0.052826\n",
"50 Topic Modelling 1 0.052826\n",
"\n",
"[63 rows x 3 columns]\n",
"\n",
"=== Function Calling Singleturn Distribution ===\n",
" Category Count Percentage\n",
"9 Information Extraction 796 42.049657\n",
"47 API Call 119 6.286318\n",
"40 Industrial Software 65 3.433703\n",
"24 Utilities Software 52 2.746962\n",
"38 Robotic Process Automation (RPA) 51 2.694136\n",
".. ... ... ...\n",
"59 SAP 2 0.105652\n",
"18 Document Ranking 1 0.052826\n",
"20 Marketing 1 0.052826\n",
"45 Business Intelligence 1 0.052826\n",
"50 Topic Modelling 1 0.052826\n",
"\n",
"[63 rows x 3 columns]\n",
"\n",
"=== Glaive dataset balance Distribution ===\n",
" Category Count Percentage\n",
"0 Stocks and Orders 645 12.382415\n",
"1 Movie 579 11.115377\n",
"2 Flight Services 551 10.577846\n",
"3 Request Management 547 10.501056\n",
"4 Loan and Financial Calculations 429 8.235746\n",
"5 Location Services 357 6.853523\n",
"6 Productivity 350 6.719140\n",
"7 Recipe Collections 313 6.008831\n",
"8 Mathematics 156 2.994817\n",
"9 Machine Learning 132 2.534076\n",
"10 History and Culture 129 2.476483\n",
"11 Book Search 126 2.418890\n",
"12 Exploration 116 2.226915\n",
"13 Language and Linguistics 92 1.766174\n",
"14 Natural Language Processing 83 1.593396\n",
"15 Data Analysis and Programming 79 1.516606\n",
"16 Web Development and Design 69 1.324630\n",
"17 Science and Nature Exploration 49 0.940680\n",
"18 Database and SQL 43 0.825494\n",
"19 Technology 42 0.806297\n",
"20 Business Strategies 41 0.787099\n",
"21 Programming Concepts 39 0.748704\n",
"22 Science Education 32 0.614321\n",
"23 Puzzle and Problem-Solving 27 0.518334\n",
"24 Language and Logic 27 0.518334\n",
"25 Programming and Computer Science Questions 24 0.460741\n",
"26 Climate and Environmental Solutions 24 0.460741\n",
"27 Literary Analysis 23 0.441543\n",
"28 Cybersecurity and Encryption 21 0.403148\n",
"29 Data Science 20 0.383951\n",
"30 Finance & Economics 16 0.307161\n",
"31 Swift Programming 15 0.287963\n",
"32 Quantum Physics 13 0.249568\n"
]
}
],
"source": [
"# Inspect the raw category balance of each split before any resampling.\n",
"print(\"\\nAnalyzing individual datasets:\")\n",
"agentic_dist = get_distributions(agentic, \"Agentic\")\n",
"singleturn_dist = get_distributions(singleturn, \"Singleturn\")\n",
"func_dist = get_distributions(func_calling, \"Function Calling\")\n",
"func_single_dist = get_distributions(func_single, \"Function Calling Singleturn\")\n",
"glaive_ds_dist = get_distributions(glaive_ds, \"Glaive dataset balance\")"
]
},
{
"cell_type": "markdown",
"id": "8cf021ad-830e-4756-af07-b1afd7db2f39",
"metadata": {},
"source": [
"## Let's start with the agentic dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2314e5a5-3f1c-432a-8a14-ccd36c0cbae4",
"metadata": {},
"outputs": [],
"source": [
"# Category mapping for merging: collapses near-duplicate / tiny agentic\n",
"# categories into broader buckets. Identity entries (e.g. 'DSPy Agents')\n",
"# keep high-volume categories out of the DEFAULT_CATEGORY fallback below.\n",
"CATEGORY_MAPPING = {\n",
"    'Simulacrum Agent': 'Simulacra Agents',\n",
"    'Simulacra Agent': 'Simulacra Agents',\n",
"    \n",
"    'Outlines Agents': 'Outlines Agents',\n",
"    'Outlines Agent': 'Outlines Agents',\n",
"    \n",
"    'Minecraft Agent': 'Minecraft Agents',\n",
"    'Voyager MineCraft Agent': 'Minecraft Agents',\n",
"    \n",
"    'Agent Frameworks': 'Development Frameworks',\n",
"    'Copilot Frameworks': 'Development Frameworks',\n",
"    \n",
"    'AI Analysis Agent': 'Utility Agents',\n",
"    'Code Analysis Agent': 'Utility Agents',\n",
"    'File Management Agent': 'Utility Agents',\n",
"    'Utility Function': 'Utility Agents',\n",
"    'WebBrowser Agent': 'Utility Agents',\n",
"    \n",
"    'Data Structures': 'Data Processing Agents',\n",
"    'Data Structure': 'Data Processing Agents',\n",
"    'Data Compression': 'Data Processing Agents',\n",
"    \n",
"    'DSPy Agents': 'DSPy Agents',\n",
"    'LLM Agents': 'LLM Agents',\n",
"    'Instructor Agents': 'Instructor Agents',\n",
"    'Autogen Agents': 'Autogen Agents',\n",
"    'LlamaIndex Agents': 'LlamaIndex Agents',\n",
"    'Langchain Agents': 'Langchain Agents',\n",
"}\n",
"# Fallback bucket for any category not listed in CATEGORY_MAPPING.\n",
"DEFAULT_CATEGORY = 'Other Agents'\n",
"\n",
"def balance_agentic_dataset(target_size=25):\n",
"    \"\"\"Load json-mode-agentic, merge categories, and cap each at target_size.\n",
"\n",
"    Returns the balanced list of examples. (The original version of this cell\n",
"    ended without a return statement, so callers silently received None; a\n",
"    more verbose variant of this function is redefined in a later cell.)\n",
"    \"\"\"\n",
"    dataset = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')\n",
"\n",
"    category_groups = defaultdict(list)\n",
"    for item in dataset['train']:\n",
"        original_category = item['category']\n",
"        mapped_category = CATEGORY_MAPPING.get(original_category, DEFAULT_CATEGORY)\n",
"        category_groups[mapped_category].append(item)\n",
"\n",
"    print(\"\\nOriginal distribution after category mapping:\")\n",
"    for cat, items in category_groups.items():\n",
"        print(f\"{cat}: {len(items)}\")\n",
"\n",
"    # Balance dataset: downsample large categories, keep small ones whole.\n",
"    balanced_data = []\n",
"    for category, items in category_groups.items():\n",
"        if len(items) > target_size:\n",
"            balanced_data.extend(random.sample(items, target_size))\n",
"        else:\n",
"            balanced_data.extend(items)\n",
"    return balanced_data"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "13a4bbb3-30d2-42a2-b5ef-4c77fc7aa90f",
"metadata": {},
"outputs": [],
"source": [
"DEFAULT_CATEGORY = 'Other Agents'\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7ebaf2d-fb69-4257-9879-0be534a9d03f",
"metadata": {},
"outputs": [],
"source": [
"def balance_agentic_dataset(target_size=25, seed=None):\n",
"    \"\"\"Load json-mode-agentic, merge categories, and cap each at target_size.\n",
"\n",
"    Categories are collapsed through CATEGORY_MAPPING (unknown ones fall into\n",
"    DEFAULT_CATEGORY); each merged category is randomly downsampled to at most\n",
"    target_size examples. Prints before/after distributions and returns the\n",
"    balanced list of examples.\n",
"\n",
"    Args:\n",
"        target_size: maximum number of examples kept per merged category.\n",
"        seed: optional int for reproducible sampling; None keeps the\n",
"            previous non-deterministic behaviour.\n",
"    \"\"\"\n",
"    dataset = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')\n",
"    rng = random.Random(seed)\n",
"\n",
"    category_groups = defaultdict(list)\n",
"    for item in dataset['train']:\n",
"        original_category = item['category']\n",
"        mapped_category = CATEGORY_MAPPING.get(original_category, DEFAULT_CATEGORY)\n",
"        category_groups[mapped_category].append(item)\n",
"\n",
"    print(\"\\nOriginal distribution after category mapping:\")\n",
"    for cat, items in category_groups.items():\n",
"        print(f\"{cat}: {len(items)}\")\n",
"\n",
"    # Balance dataset: downsample large categories, keep small ones whole.\n",
"    balanced_data = []\n",
"    for category, items in category_groups.items():\n",
"        if len(items) > target_size:\n",
"            balanced_data.extend(rng.sample(items, target_size))\n",
"        else:\n",
"            balanced_data.extend(items)\n",
"\n",
"    final_distribution = defaultdict(int)\n",
"    for item in balanced_data:\n",
"        mapped_category = CATEGORY_MAPPING.get(item['category'], DEFAULT_CATEGORY)\n",
"        final_distribution[mapped_category] += 1\n",
"\n",
"    print(\"\\nFinal distribution:\")\n",
"    df_final = pd.DataFrame(list(final_distribution.items()), \n",
"                           columns=['Category', 'Count'])\n",
"    df_final['Percentage'] = df_final['Count'] / len(balanced_data) * 100\n",
"    print(df_final.sort_values('Count', ascending=False))\n",
"    \n",
"    print(f\"\\nOriginal dataset size: {len(dataset['train'])}\")\n",
"    print(f\"Balanced dataset size: {len(balanced_data)}\")\n",
"    \n",
"    return balanced_data\n"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "f77af624-232a-4802-80f9-93fd458cc573",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Original distribution after category mapping:\n",
"Simulacra Agents: 130\n",
"Other Agents: 241\n",
"Data Processing Agents: 3\n",
"Utility Agents: 30\n",
"Development Frameworks: 111\n",
"Minecraft Agents: 54\n",
"Outlines Agents: 26\n",
"DSPy Agents: 457\n",
"LLM Agents: 93\n",
"Langchain Agents: 40\n",
"Autogen Agents: 45\n",
"Instructor Agents: 68\n",
"LlamaIndex Agents: 44\n",
"\n",
"Final distribution:\n",
" Category Count Percentage\n",
"0 Simulacra Agents 25 8.250825\n",
"1 Other Agents 25 8.250825\n",
"3 Utility Agents 25 8.250825\n",
"4 Development Frameworks 25 8.250825\n",
"5 Minecraft Agents 25 8.250825\n",
"9 Langchain Agents 25 8.250825\n",
"6 Outlines Agents 25 8.250825\n",
"7 DSPy Agents 25 8.250825\n",
"8 LLM Agents 25 8.250825\n",
"11 Instructor Agents 25 8.250825\n",
"10 Autogen Agents 25 8.250825\n",
"12 LlamaIndex Agents 25 8.250825\n",
"2 Data Processing Agents 3 0.990099\n",
"\n",
"Original dataset size: 1342\n",
"Balanced dataset size: 303\n"
]
}
],
"source": [
"balanced_data = balance_agentic_dataset(25)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "105933dd-5b05-41cf-8e61-cbe9abdc82de",
"metadata": {},
"outputs": [],
"source": [
"balanced_dataset = Dataset.from_list(balanced_data)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "48674a73-227c-4a23-aa27-72bed628f0d9",
"metadata": {},
"outputs": [],
"source": [
"dataset_dict = DatasetDict({\n",
" 'train': balanced_dataset\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "0b9fd38b-c600-4f3e-9bcf-741340ae9218",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "718bda35da8b4d06a788be51d2234d8b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Saving the dataset (0/1 shards): 0%| | 0/303 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset_dict.save_to_disk(\"balanced-json-modeagentic\")"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "e7c52093-c1ea-4a0d-a469-86571cca8784",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['id', 'conversations', 'category', 'subcategory', 'schema'],\n",
" num_rows: 303\n",
"})"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_dict['train']"
]
},
{
"cell_type": "markdown",
"id": "a7c8acd0-188b-4897-b8e3-57f4024428bb",
"metadata": {},
"source": [
"## Func-Calling"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b879509f-c8ed-42b4-9edd-b3b0056d2196",
"metadata": {},
"outputs": [],
"source": [
"def balance_func_dataset(target_size=25, seed=None):\n",
"    \"\"\"Load func-calling.json and cap each category at target_size examples.\n",
"\n",
"    Categories with fewer than target_size examples are kept whole; larger\n",
"    ones are randomly downsampled. Prints before/after distributions and\n",
"    returns the balanced list of examples.\n",
"\n",
"    Args:\n",
"        target_size: maximum number of examples kept per category.\n",
"        seed: optional int for reproducible sampling; None keeps the\n",
"            previous non-deterministic behaviour.\n",
"    \"\"\"\n",
"    dataset = load_dataset('json', data_files='hermes-function-calling-v1/func-calling.json')\n",
"    rng = random.Random(seed)\n",
"\n",
"    category_groups = defaultdict(list)\n",
"    for item in dataset['train']:\n",
"        category_groups[item['category']].append(item)\n",
"\n",
"    print(\"\\nOriginal distribution:\")\n",
"    for cat, items in category_groups.items():\n",
"        print(f\"{cat}: {len(items)}\")\n",
"\n",
"    # Balance dataset - cap at target_size but keep smaller categories as is\n",
"    balanced_data = []\n",
"    for category, items in category_groups.items():\n",
"        if len(items) > target_size:\n",
"            balanced_data.extend(rng.sample(items, target_size))\n",
"        else:\n",
"            balanced_data.extend(items)\n",
"\n",
"    final_distribution = defaultdict(int)\n",
"    for item in balanced_data:\n",
"        final_distribution[item['category']] += 1\n",
"\n",
"    print(\"\\nFinal distribution:\")\n",
"    df_final = pd.DataFrame(list(final_distribution.items()), \n",
"                           columns=['Category', 'Count'])\n",
"    df_final['Percentage'] = df_final['Count'] / len(balanced_data) * 100\n",
"    print(df_final.sort_values('Count', ascending=False))\n",
"    \n",
"    print(f\"\\nOriginal dataset size: {len(dataset['train'])}\")\n",
"    print(f\"Balanced dataset size: {len(balanced_data)}\")\n",
"    \n",
"    return balanced_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1fc2cd0-0cdf-4e04-baa0-b90bae27a76a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Original distribution:\n",
"IoT and Home Automation: 15\n",
"Quantum Computing: 8\n",
"Services Industry Software: 17\n",
"IoT Platforms: 8\n",
"Communication Services Software: 37\n",
"Materials Software: 9\n",
"E-commerce Platforms: 23\n",
"Blockchain Integration: 5\n",
"Data Centers and High Performance Computing: 5\n",
"Information Extraction: 796\n",
"Data Privacy: 10\n",
"Annotation: 4\n",
"Model APIs: 23\n",
"Project Management: 3\n",
"Financial Services Apps: 29\n",
"Decentralized Apps (DApps): 3\n",
"Use Apps: 37\n",
"Office Administration: 4\n",
"Document Ranking: 1\n",
"CRM: 3\n",
"Marketing: 1\n",
"Code Interpreters: 9\n",
"Algorithmic Trading: 39\n",
"Energy Software: 9\n",
"Utilities Software: 52\n",
"Information Technology Software: 35\n",
"Data Processing: 19\n",
"Database Management: 10\n",
"Operating System Functions: 14\n",
"Text Classification: 3\n",
"Robotics and Automation: 8\n",
"Networking and Cybersecurity: 6\n",
"Consumer Discretionary Software: 25\n",
"Named Entity Recognition: 8\n",
"Information Retrieval (RAG): 20\n",
"Productivity Tools Integration: 8\n",
"Web APIs: 9\n",
"Low-Code Enterprise Platforms: 19\n",
"Robotic Process Automation (RPA): 51\n",
"DevOps: 3\n",
"Industrial Software: 65\n",
"Voice Assistants: 10\n",
"Healthcare Software: 40\n",
"Desktop Applications: 33\n",
"OpenAI API Integration: 29\n",
"Business Intelligence: 1\n",
"Financial Software: 46\n",
"API Call: 119\n",
"SaaS Platforms: 28\n",
"Relation Extraction: 3\n",
"Topic Modelling: 1\n",
"Compliance: 3\n",
"Consumer Staples Software: 29\n",
"Real Estate Software: 9\n",
"AI Model Integration: 12\n",
"Mobile Applications: 34\n",
"Web Browser Agent: 11\n",
"Git Operations: 5\n",
"Identity and Access Management (IAM): 3\n",
"SAP: 2\n",
"HR: 4\n",
"Cloud Platforms: 16\n",
"Accounting & Finance: 2\n",
"\n",
"Final distribution:\n",
" Category Count Percentage\n",
"4 Communication Services Software 25 3.052503\n",
"14 Financial Services Apps 25 3.052503\n",
"9 Information Extraction 25 3.052503\n",
"24 Utilities Software 25 3.052503\n",
"25 Information Technology Software 25 3.052503\n",
".. ... ... ...\n",
"59 SAP 2 0.244200\n",
"18 Document Ranking 1 0.122100\n",
"20 Marketing 1 0.122100\n",
"45 Business Intelligence 1 0.122100\n",
"50 Topic Modelling 1 0.122100\n",
"\n",
"[63 rows x 3 columns]\n",
"\n",
"Original dataset size: 1893\n",
"Balanced dataset size: 819\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ad046fb8ab7943a7971785ef5c33f41b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Saving the dataset (0/1 shards): 0%| | 0/819 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Saved balanced dataset to balanced_func_calling\n"
]
}
],
"source": [
"def save_as_hf_dataset(balanced_data, output_path='balanced_func_calling'):\n",
"    \"\"\"Wrap `balanced_data` (a list of example dicts) in a DatasetDict with a\n",
"    single 'train' split, save it to disk at `output_path`, and return the\n",
"    DatasetDict.\n",
"    \"\"\"\n",
"    balanced_dataset = Dataset.from_list(balanced_data)\n",
"    \n",
"    dataset_dict = DatasetDict({\n",
"        'train': balanced_dataset\n",
"    })\n",
"    \n",
"    dataset_dict.save_to_disk(output_path)\n",
"    print(f\"\\nSaved balanced dataset to {output_path}\")\n",
"    return dataset_dict\n",
"\n",
"# Balance the func-calling split, then persist it as a HF dataset on disk.\n",
"balanced_data = balance_func_dataset(25)\n",
"dataset_dict = save_as_hf_dataset(balanced_data)"
]
},
{
"cell_type": "markdown",
"id": "82d2b786-a822-457f-96b5-e02e1730be01",
"metadata": {},
"source": [
"## Merge both single-turn datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2d8eec4-97df-4abb-b248-e9236a777fd6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Original func_single size: 1893\n",
"Original json_single size: 1241\n",
"\n",
"func_calling_singleturn:\n",
"Number of categories: 63\n",
"Samples per category: 2\n",
"\n",
"Distribution for func_calling_singleturn:\n",
" Category Count Percentage\n",
"0 IoT and Home Automation 2 1.639344\n",
"1 Quantum Computing 2 1.639344\n",
"2 Services Industry Software 2 1.639344\n",
"3 IoT Platforms 2 1.639344\n",
"4 Communication Services Software 2 1.639344\n",
".. ... ... ...\n",
"58 Identity and Access Management (IAM) 2 1.639344\n",
"18 Document Ranking 1 0.819672\n",
"20 Marketing 1 0.819672\n",
"45 Business Intelligence 1 0.819672\n",
"50 Topic Modelling 1 0.819672\n",
"\n",
"[63 rows x 3 columns]\n",
"\n",
"func_calling_singleturn final size: 122\n",
"\n",
"json_mode_singleturn:\n",
"Number of categories: 23\n",
"Samples per category: 6\n",
"\n",
"Distribution for json_mode_singleturn:\n",
" Category Count Percentage\n",
"0 Pharmaceuticals, Biotechnology & Life Sciences 6 4.347826\n",
"1 Capital Goods 6 4.347826\n",
"2 Transportation 6 4.347826\n",
"3 Materials 6 4.347826\n",
"4 Software & Services 6 4.347826\n",
"5 Consumer Durables & Apparel 6 4.347826\n",
"6 Commercial & Professional Services 6 4.347826\n",
"7 Energy 6 4.347826\n",
"8 Media & Entertainment 6 4.347826\n",
"9 JSON Schema 6 4.347826\n",
"10 Food, Beverage & Tobacco 6 4.347826\n",
"11 Telecommunication Services 6 4.347826\n",
"12 Semiconductors & Semiconductor Equipment 6 4.347826\n",
"13 Automobiles & Components 6 4.347826\n",
"14 Household & Personal Products 6 4.347826\n",
"15 Consumer Services 6 4.347826\n",
"16 Consumer Staples Distribution & Retail (New Name) 6 4.347826\n",
"17 Technology Hardware & Equipment 6 4.347826\n",
"18 Financial Services (New Name) 6 4.347826\n",
"19 Banks 6 4.347826\n",
"20 Insurance 6 4.347826\n",
"21 Health Care Equipment & Services 6 4.347826\n",
"22 Consumer Discretionary Distribution & Retail (... 6 4.347826\n",
"\n",
"json_mode_singleturn final size: 138\n",
"\n",
"Final merged dataset statistics:\n",
"Total examples: 260\n",
"From func_calling_singleturn: 122\n",
"From json_mode_singleturn: 138\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3d8cb0044406475cb114f1f6559c93ab",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Saving the dataset (0/1 shards): 0%| | 0/260 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Saved merged dataset to balanced_singleturn_merged\n"
]
}
],
"source": [
"from datasets import load_dataset, Dataset, DatasetDict\n",
"import random\n",
"from collections import defaultdict\n",
"import pandas as pd\n",
"import math\n",
"\n",
"def downsample_and_tag_dataset(dataset, source_name, target_total=150):\n",
"    \"\"\"Downsample dataset['train'] to roughly target_total examples and tag them.\n",
"\n",
"    Each category is capped at max(1, floor(target_total / num_categories))\n",
"    examples, so the final size may differ from target_total. Every kept item\n",
"    is mutated in place with item['dataset_source'] = source_name. Prints the\n",
"    resulting distribution and returns the downsampled list.\n",
"    \"\"\"\n",
"    category_groups = defaultdict(list)\n",
"    for item in dataset['train']:\n",
"        category_groups[item['category']].append(item)\n",
"    \n",
"    num_categories = len(category_groups)\n",
"    samples_per_category = max(1, math.floor(target_total / num_categories))\n",
"    \n",
"    print(f\"\\n{source_name}:\")\n",
"    print(f\"Number of categories: {num_categories}\")\n",
"    print(f\"Samples per category: {samples_per_category}\")\n",
"    \n",
"    # Downsample large categories; categories under the quota are kept whole.\n",
"    balanced_data = []\n",
"    for category, items in category_groups.items():\n",
"        if len(items) > samples_per_category:\n",
"            sampled_items = random.sample(items, samples_per_category)\n",
"            balanced_data.extend(sampled_items)\n",
"        else:\n",
"            balanced_data.extend(items)\n",
"    \n",
"    # Add source tag to each example\n",
"    for item in balanced_data:\n",
"        item['dataset_source'] = source_name\n",
"    \n",
"    final_distribution = defaultdict(int)\n",
"    for item in balanced_data:\n",
"        final_distribution[item['category']] += 1\n",
"    \n",
"    print(f\"\\nDistribution for {source_name}:\")\n",
"    df_final = pd.DataFrame(list(final_distribution.items()), \n",
"                           columns=['Category', 'Count'])\n",
"    df_final['Percentage'] = df_final['Count'] / len(balanced_data) * 100\n",
"    print(df_final.sort_values('Count', ascending=False))\n",
"    \n",
"    print(f\"\\n{source_name} final size: {len(balanced_data)}\")\n",
"    return balanced_data\n",
"\n",
"def merge_and_save_datasets(target_per_dataset=150):\n",
"    \"\"\"Build and persist the merged single-turn dataset.\n",
"\n",
"    Downsamples func-calling-singleturn and json-mode-singleturn to roughly\n",
"    target_per_dataset examples each, tags every example with its source,\n",
"    concatenates them, saves the result to 'balanced_singleturn_merged', and\n",
"    returns the resulting DatasetDict.\n",
"    \"\"\"\n",
"    func_single = load_dataset('json', data_files='hermes-function-calling-v1/func-calling-singleturn.json')\n",
"    json_single = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-singleturn.json')\n",
"    \n",
"    print(f\"Original func_single size: {len(func_single['train'])}\")\n",
"    print(f\"Original json_single size: {len(json_single['train'])}\")\n",
"    \n",
"    func_balanced = downsample_and_tag_dataset(func_single, 'func_calling_singleturn', target_per_dataset)\n",
"    json_balanced = downsample_and_tag_dataset(json_single, 'json_mode_singleturn', target_per_dataset)\n",
"    \n",
"    # Merge datasets\n",
"    merged_data = func_balanced + json_balanced\n",
"    \n",
"    merged_dataset = Dataset.from_list(merged_data)\n",
"    dataset_dict = DatasetDict({\n",
"        'train': merged_dataset\n",
"    })\n",
"\n",
"    print(\"\\nFinal merged dataset statistics:\")\n",
"    print(f\"Total examples: {len(merged_data)}\")\n",
"    print(f\"From func_calling_singleturn: {len(func_balanced)}\")\n",
"    print(f\"From json_mode_singleturn: {len(json_balanced)}\")\n",
"    \n",
"    output_path = 'balanced_singleturn_merged'\n",
"    dataset_dict.save_to_disk(output_path)\n",
"    print(f\"\\nSaved merged dataset to {output_path}\")\n",
"    \n",
"    return dataset_dict\n",
"\n",
"# Run the merging process\n",
"merged_dataset = merge_and_save_datasets(150)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "7a737c6f-b2e2-4362-bc1c-2fff95b3488a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['id', 'conversations', 'category', 'subcategory', 'task', 'dataset_source'],\n",
" num_rows: 260\n",
" })\n",
"})"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_dataset"
]
},
{
"cell_type": "markdown",
"id": "2d1d1591-f972-4d64-ac30-d3cafb402825",
"metadata": {},
"source": [
"## Glaive"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "8f5b404f-4fad-42a0-8168-8b333cf21064",
"metadata": {},
"outputs": [],
"source": [
"def create_category_mapping():\n",
"    \"\"\"Create mapping from original categories to new merged categories.\n",
"\n",
"    Glaive categories not listed here are dropped entirely by\n",
"    balance_dataset(), which only keeps items whose category appears\n",
"    as a key in this mapping.\n",
"    \"\"\"\n",
"    return {\n",
"        # Technology & Computing\n",
"        'Technology': 'tech_computing',\n",
"        'Programming Concepts': 'tech_computing',\n",
"        'Programming and Computer Science Questions': 'tech_computing',\n",
"        'Web Development and Design': 'tech_computing',\n",
"        'Database and SQL': 'tech_computing',\n",
"        'Swift Programming': 'tech_computing',\n",
"        'Cybersecurity and Encryption': 'tech_computing',\n",
"        \n",
"        # Data & Analytics\n",
"        'Data Science': 'data_analytics',\n",
"        'Data Analysis and Programming': 'data_analytics',\n",
"        'Machine Learning': 'data_analytics',\n",
"        'Natural Language Processing': 'data_analytics',\n",
"        \n",
"        # Finance & Business\n",
"        'Stocks and Orders': 'finance_business',\n",
"        'Loan and Financial Calculations': 'finance_business',\n",
"        'Finance & Economics': 'finance_business',\n",
"        'Business Strategies': 'finance_business',\n",
"        \n",
"        # Science & Education\n",
"        'Science Education': 'science_education',\n",
"        'Science and Nature Exploration': 'science_education',\n",
"        'Quantum Physics': 'science_education',\n",
"        'Climate and Environmental Solutions': 'science_education',\n",
"        \n",
"        # Services & Productivity\n",
"        'Flight Services': 'services_productivity',\n",
"        'Location Services': 'services_productivity',\n",
"        'Productivity': 'services_productivity',\n",
"        'Request Management': 'services_productivity',\n",
"        \n",
"        # Knowledge & Culture\n",
"        'History and Culture': 'knowledge_culture',\n",
"        'Book Search': 'knowledge_culture',\n",
"        'Literary Analysis': 'knowledge_culture',\n",
"        'Language and Linguistics': 'knowledge_culture',\n",
"        'Language and Logic': 'knowledge_culture'\n",
"    }\n",
"\n",
"def balance_dataset(dataset: list[dict], target_size: int = 500) -> list[dict]:\n",
"    \"\"\"\n",
"    Balance the dataset by:\n",
"    1. Merging categories via create_category_mapping()\n",
"    2. Downsampling each merged category to target_size // n_categories\n",
"    3. Dropping items whose category is not in the mapping\n",
"\n",
"    NOTE: uses built-in generics (list[dict]) instead of typing.List/Dict,\n",
"    which were never imported in this notebook and raised NameError on a\n",
"    fresh kernel.\n",
"    NOTE: mutates the sampled items in place by overwriting item['category']\n",
"    with the merged category name.\n",
"    \"\"\"\n",
"    category_mapping = create_category_mapping()\n",
"    \n",
"    # Group data by new (merged) categories; unmapped categories are dropped.\n",
"    new_categories = defaultdict(list)\n",
"    for item in dataset:\n",
"        if item['category'] in category_mapping:\n",
"            new_cat = category_mapping[item['category']]\n",
"            new_categories[new_cat].append(item)\n",
"    \n",
"    if not new_categories:\n",
"        # Nothing mapped: avoid a ZeroDivisionError below.\n",
"        return []\n",
"    \n",
"    # Calculate target size per category (floor division means the final\n",
"    # total may undershoot target_size slightly).\n",
"    n_categories = len(new_categories)\n",
"    target_per_category = target_size // n_categories\n",
"    \n",
"    # Balance categories: downsample large ones, keep small ones whole.\n",
"    balanced_data = []\n",
"    for category, items in new_categories.items():\n",
"        if len(items) > target_per_category:\n",
"            sampled_items = random.sample(items, target_per_category)\n",
"        else:\n",
"            # If we have fewer items than target, use all of them\n",
"            sampled_items = items\n",
"        \n",
"        # Update category name in items (in-place mutation of the originals).\n",
"        for item in sampled_items:\n",
"            item['category'] = category\n",
"            balanced_data.append(item)\n",
"    \n",
"    return balanced_data\n",
"\n",
"def analyze_balanced_dataset(balanced_data: list[dict]):\n",
"    \"\"\"Print the per-category distribution and total size of the balanced data.\n",
"\n",
"    Uses built-in generics (list[dict]) because typing.List/Dict were never\n",
"    imported in this notebook (the original annotation raised NameError on a\n",
"    fresh kernel).\n",
"    \"\"\"\n",
"    category_counts = defaultdict(int)\n",
"    for item in balanced_data:\n",
"        category_counts[item['category']] += 1\n",
"    \n",
"    print(\"\\n=== Balanced Dataset Distribution ===\")\n",
"    for category, count in sorted(category_counts.items()):\n",
"        print(f\"{category}: {count}\")\n",
"    print(f\"\\nTotal samples: {sum(category_counts.values())}\")"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "070c3c91-4cf8-4667-a2aa-c089c75c3a23",
"metadata": {},
"outputs": [],
"source": [
"balanced_data = balance_dataset(glaive_ds['train'])"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "24bf99ca-e9b6-4aa9-952e-0c422c57da1e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Balanced Dataset Distribution ===\n",
"data_analytics: 83\n",
"finance_business: 83\n",
"knowledge_culture: 83\n",
"science_education: 83\n",
"services_productivity: 83\n",
"tech_computing: 83\n",
"\n",
"Total samples: 498\n"
]
}
],
"source": [
"analyze_balanced_dataset(balanced_data)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "f2927eed-f46e-4486-9b57-df7601ccda1f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': '15d547ad-bc66-4b75-8d07-3258b424c023',\n",
" 'conversations': [{'from': 'system',\n",
" 'value': \"You are a function calling AI model. You are provided with function signatures within XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools: [{'type': 'function', 'function': {'name': 'get_stock_price', 'description': 'Get the current stock price of a company', 'parameters': {'type': 'object', 'properties': {'company': {'type': 'string', 'description': 'The name of the company'}, 'symbol': {'type': 'string', 'description': 'The stock symbol of the company'}}, 'required': ['company', 'symbol']}}}, {'type': 'function', 'function': {'name': 'search_movies', 'description': 'Search for movies based on title or genre', 'parameters': {'type': 'object', 'properties': {'title': {'type': 'string', 'description': 'The title of the movie'}, 'genre': {'type': 'string', 'description': 'The genre of the movie'}}, 'required': []}}}] Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}For each function call return a json object with function name and arguments within XML tags as follows:\\n\\n{tool_call}\\n\"},\n",
" {'from': 'human',\n",
" 'value': 'Hi, I would like to know the current stock price of Apple.'},\n",
" {'from': 'gpt',\n",
" 'value': 'Sure, I can help with that. Could you please provide me with the stock symbol of Apple?'},\n",
" {'from': 'human', 'value': 'Yes, the stock symbol for Apple is AAPL.'},\n",
" {'from': 'gpt',\n",
" 'value': \"\\n{'name': 'get_stock_price', 'arguments': {'company': 'Apple', 'symbol': 'AAPL'}}\\n\"},\n",
" {'from': 'tool',\n",
" 'value': \"\\n{'status': 'success', 'message': 'The current stock price of Apple (AAPL) is $150'}\\n\"},\n",
" {'from': 'gpt', 'value': 'The current stock price of Apple (AAPL) is $150.'},\n",
" {'from': 'human', 'value': 'Thank you for the information.'},\n",
" {'from': 'gpt',\n",
" 'value': \"You're welcome! If you have any other questions, feel free to ask.\"}],\n",
" 'tools': '[{\"type\": \"function\", \"function\": {\"name\": \"get_stock_price\", \"description\": \"Get the current stock price of a company\", \"parameters\": {\"type\": \"object\", \"properties\": {\"company\": {\"type\": \"string\", \"description\": \"The name of the company\"}, \"symbol\": {\"type\": \"string\", \"description\": \"The stock symbol of the company\"}}, \"required\": [\"company\", \"symbol\"]}}}, {\"type\": \"function\", \"function\": {\"name\": \"search_movies\", \"description\": \"Search for movies based on title or genre\", \"parameters\": {\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\", \"description\": \"The title of the movie\"}, \"genre\": {\"type\": \"string\", \"description\": \"The genre of the movie\"}}, \"required\": []}}}]',\n",
" 'category': 'finance_business',\n",
" 'subcategory': None,\n",
" 'task': 'Stock price inquiries for Apple (AAPL)',\n",
" 'source': 'Glaive'}"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"balanced_data[24]"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "0a4eabb6-dd9d-49e1-a6f9-73a76898bd74",
"metadata": {},
"outputs": [],
"source": [
"balanced_dataset = datasets.Dataset.from_list(balanced_data)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "9d1986b5-5b1a-445c-b684-22b2ca7d361b",
"metadata": {},
"outputs": [],
"source": [
"dataset_dict = datasets.DatasetDict({\n",
" 'train': balanced_dataset\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "e80d2773-ac1e-443f-839c-82e8bddaad07",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a4d7aa3f0a6043dc85590cd480e344e6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Saving the dataset (0/1 shards): 0%| | 0/498 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset_dict.save_to_disk(\"glaive-balanced\")"
]
},
{
"cell_type": "code",
"execution_count": 104,
"id": "03262560-c031-4ec2-b3f5-b468e792af38",
"metadata": {},
"outputs": [],
"source": [
"#fin"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}