{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"id": "0978b512-1a88-447e-b80a-ee4b72152038",
"metadata": {},
"outputs": [],
"source": [
"# NOTE: each `!` command runs in its own subshell, so a bare `!cd` on its own\n",
"# line does not persist to the next line; chain the commands so the LFS pull\n",
"# actually runs inside the repo directory.\n",
"!cd hermes-function-calling-v1 && git lfs pull"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "acb50ef7-fa53-438d-bce7-1eebf7bc0ba5",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset, Dataset, DatasetDict\n",
"from collections import defaultdict\n",
"import pandas as pd\n",
"import random\n",
"import datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec662394-2ae9-4081-87dd-bf75c4f76500",
"metadata": {},
"outputs": [],
"source": [
"def analyze_distribution(data):\n",
"    \"\"\"Tally items by 'category', 'subcategory', and (category, subcategory).\n",
"\n",
"    Each element of `data` must be a mapping with 'category' and 'subcategory'\n",
"    keys. Returns a 3-tuple of plain dicts:\n",
"    (category_counts, subcategory_counts, category_subcategory_counts).\n",
"    NOTE: the values of the third dict are still inner defaultdicts after the\n",
"    outer dict() conversion (dict() is a shallow copy).\n",
"    \"\"\"\n",
"    category_counts = defaultdict(int)\n",
"    subcategory_counts = defaultdict(int)\n",
"    category_subcategory_counts = defaultdict(lambda: defaultdict(int))\n",
"    \n",
"    for item in data:\n",
"        category = item['category']\n",
"        subcategory = item['subcategory']\n",
"        \n",
"        category_counts[category] += 1\n",
"        subcategory_counts[subcategory] += 1\n",
"        category_subcategory_counts[category][subcategory] += 1\n",
"    \n",
"    return dict(category_counts), dict(subcategory_counts), dict(category_subcategory_counts)\n"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "63ff9ec7-f68a-4344-acf4-e99efbdd4e7c",
"metadata": {},
"outputs": [],
"source": [
"# Load each raw split of Hermes function-calling v1 from the local clone.\n",
"# Paths are relative to the notebook; the JSON files are fetched by the\n",
"# `git lfs pull` cell above.\n",
"agentic = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')\n",
"singleturn = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-singleturn.json')\n",
"func_calling = load_dataset('json', data_files='hermes-function-calling-v1/func-calling.json')\n",
"func_single = load_dataset('json', data_files='hermes-function-calling-v1/func-calling-singleturn.json')\n",
"glaive_ds = load_dataset('json', data_files='hermes-function-calling-v1/glaive-function-calling-5k.json')"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "b4ded1b5-bc3c-4719-9b46-0a57f7169a6f",
"metadata": {},
"outputs": [],
"source": [
"def get_distributions(dataset, name):\n",
"    \"\"\"Print the category distribution of dataset['train'] and return raw counts.\n",
"\n",
"    Prints a Count/Percentage table sorted by frequency under a `name` header\n",
"    and returns a plain dict mapping category name -> number of examples.\n",
"    \"\"\"\n",
"    counts = defaultdict(int)\n",
"    for example in dataset['train']:\n",
"        counts[example['category']] += 1\n",
"\n",
"    print(f\"\\n=== {name} Distribution ===\")\n",
"    table = pd.DataFrame(list(counts.items()), columns=['Category', 'Count'])\n",
"    table['Percentage'] = table['Count'] / table['Count'].sum() * 100\n",
"    print(table.sort_values('Count', ascending=False))\n",
"    return dict(counts)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "cf2e0f61-2090-4691-a1c1-00d015e5fab6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Analyzing individual datasets:\n",
"\n",
"=== Agentic Distribution ===\n",
" Category Count Percentage\n",
"24 DSPy Agents 457 34.053651\n",
"26 LLM Agents 93 6.929955\n",
"8 Simulacrum Agent 71 5.290611\n",
"39 Instructor Agents 68 5.067064\n",
"23 Agent Frameworks 64 4.769001\n",
"11 JSON Schema 59 4.396423\n",
"0 Simulacra Agent 59 4.396423\n",
"10 Copilot Frameworks 47 3.502235\n",
"37 Autogen Agents 45 3.353204\n",
"41 LlamaIndex Agents 44 3.278689\n",
"33 Langchain Agents 40 2.980626\n",
"1 Embodied Agent 38 2.831595\n",
"29 Voyager MineCraft Agent 36 2.682563\n",
"32 Copilot Agents 31 2.309985\n",
"35 Guardrails Agent 28 2.086438\n",
"18 WebBrowser Agent 26 1.937407\n",
"31 FuncChain Agents 23 1.713860\n",
"30 AutoGPT Agent 21 1.564829\n",
"16 Minecraft Agent 18 1.341282\n",
"15 LangGraph Agents 15 1.117735\n",
"25 Outlines Agents 13 0.968703\n",
"19 Outlines Agent 13 0.968703\n",
"12 Coding Agents 5 0.372578\n",
"21 Guidance Agents 5 0.372578\n",
"13 Gollie Agents 4 0.298063\n",
"27 CrewAI Agent 3 0.223547\n",
"9 AI-powered Task Management System 1 0.074516\n",
"7 AI Analysis Agent 1 0.074516\n",
"6 Image Generation Agent 1 0.074516\n",
"5 Utility Function 1 0.074516\n",
"3 Data Structures 1 0.074516\n",
"4 Data Compression 1 0.074516\n",
"2 Data Structure 1 0.074516\n",
"17 Music Agent 1 0.074516\n",
"22 System 1 0.074516\n",
"20 CrewAI Integration 1 0.074516\n",
"14 BabyAGI Classic 1 0.074516\n",
"28 Autonomous Agent 1 0.074516\n",
"36 CrewAI Agent Customization 1 0.074516\n",
"34 Code Analysis Agent 1 0.074516\n",
"38 File Management Agent 1 0.074516\n",
"40 Multion API Interaction 1 0.074516\n",
"\n",
"=== Singleturn Distribution ===\n",
" Category Count Percentage\n",
"3 Materials 147 11.845286\n",
"18 Financial Services (New Name) 93 7.493956\n",
"22 Consumer Discretionary Distribution & Retail (... 92 7.413376\n",
"1 Capital Goods 88 7.091056\n",
"2 Transportation 86 6.929895\n",
"5 Consumer Durables & Apparel 79 6.365834\n",
"6 Commercial & Professional Services 75 6.043513\n",
"7 Energy 67 5.398872\n",
"21 Health Care Equipment & Services 63 5.076551\n",
"15 Consumer Services 52 4.190169\n",
"10 Food, Beverage & Tobacco 52 4.190169\n",
"17 Technology Hardware & Equipment 47 3.787268\n",
"4 Software & Services 43 3.464948\n",
"20 Insurance 40 3.223207\n",
"16 Consumer Staples Distribution & Retail (New Name) 33 2.659146\n",
"13 Automobiles & Components 32 2.578566\n",
"0 Pharmaceuticals, Biotechnology & Life Sciences 26 2.095085\n",
"8 Media & Entertainment 25 2.014504\n",
"11 Telecommunication Services 25 2.014504\n",
"19 Banks 25 2.014504\n",
"9 JSON Schema 19 1.531023\n",
"12 Semiconductors & Semiconductor Equipment 17 1.369863\n",
"14 Household & Personal Products 15 1.208703\n",
"\n",
"=== Function Calling Distribution ===\n",
" Category Count Percentage\n",
"9 Information Extraction 796 42.049657\n",
"47 API Call 119 6.286318\n",
"40 Industrial Software 65 3.433703\n",
"24 Utilities Software 52 2.746962\n",
"38 Robotic Process Automation (RPA) 51 2.694136\n",
".. ... ... ...\n",
"59 SAP 2 0.105652\n",
"18 Document Ranking 1 0.052826\n",
"20 Marketing 1 0.052826\n",
"45 Business Intelligence 1 0.052826\n",
"50 Topic Modelling 1 0.052826\n",
"\n",
"[63 rows x 3 columns]\n",
"\n",
"=== Function Calling Singleturn Distribution ===\n",
" Category Count Percentage\n",
"9 Information Extraction 796 42.049657\n",
"47 API Call 119 6.286318\n",
"40 Industrial Software 65 3.433703\n",
"24 Utilities Software 52 2.746962\n",
"38 Robotic Process Automation (RPA) 51 2.694136\n",
".. ... ... ...\n",
"59 SAP 2 0.105652\n",
"18 Document Ranking 1 0.052826\n",
"20 Marketing 1 0.052826\n",
"45 Business Intelligence 1 0.052826\n",
"50 Topic Modelling 1 0.052826\n",
"\n",
"[63 rows x 3 columns]\n",
"\n",
"=== Glaive dataset balance Distribution ===\n",
" Category Count Percentage\n",
"0 Stocks and Orders 645 12.382415\n",
"1 Movie 579 11.115377\n",
"2 Flight Services 551 10.577846\n",
"3 Request Management 547 10.501056\n",
"4 Loan and Financial Calculations 429 8.235746\n",
"5 Location Services 357 6.853523\n",
"6 Productivity 350 6.719140\n",
"7 Recipe Collections 313 6.008831\n",
"8 Mathematics 156 2.994817\n",
"9 Machine Learning 132 2.534076\n",
"10 History and Culture 129 2.476483\n",
"11 Book Search 126 2.418890\n",
"12 Exploration 116 2.226915\n",
"13 Language and Linguistics 92 1.766174\n",
"14 Natural Language Processing 83 1.593396\n",
"15 Data Analysis and Programming 79 1.516606\n",
"16 Web Development and Design 69 1.324630\n",
"17 Science and Nature Exploration 49 0.940680\n",
"18 Database and SQL 43 0.825494\n",
"19 Technology 42 0.806297\n",
"20 Business Strategies 41 0.787099\n",
"21 Programming Concepts 39 0.748704\n",
"22 Science Education 32 0.614321\n",
"23 Puzzle and Problem-Solving 27 0.518334\n",
"24 Language and Logic 27 0.518334\n",
"25 Programming and Computer Science Questions 24 0.460741\n",
"26 Climate and Environmental Solutions 24 0.460741\n",
"27 Literary Analysis 23 0.441543\n",
"28 Cybersecurity and Encryption 21 0.403148\n",
"29 Data Science 20 0.383951\n",
"30 Finance & Economics 16 0.307161\n",
"31 Swift Programming 15 0.287963\n",
"32 Quantum Physics 13 0.249568\n"
]
}
],
"source": [
"# Inspect the raw category balance of each split before any resampling.\n",
"print(\"\\nAnalyzing individual datasets:\")\n",
"agentic_dist = get_distributions(agentic, \"Agentic\")\n",
"singleturn_dist = get_distributions(singleturn, \"Singleturn\")\n",
"func_dist = get_distributions(func_calling, \"Function Calling\")\n",
"func_single_dist = get_distributions(func_single, \"Function Calling Singleturn\")\n",
"glaive_ds_dist = get_distributions(glaive_ds, \"Glaive dataset balance\")"
]
},
{
"cell_type": "markdown",
"id": "8cf021ad-830e-4756-af07-b1afd7db2f39",
"metadata": {},
"source": [
"## Let's start with the agentic dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2314e5a5-3f1c-432a-8a14-ccd36c0cbae4",
"metadata": {},
"outputs": [],
"source": [
"# Category mapping for merging: collapses near-duplicate / tiny agentic\n",
"# categories into broader buckets. Identity entries (e.g. 'DSPy Agents')\n",
"# keep high-volume categories out of the DEFAULT_CATEGORY fallback below.\n",
"CATEGORY_MAPPING = {\n",
"    'Simulacrum Agent': 'Simulacra Agents',\n",
"    'Simulacra Agent': 'Simulacra Agents',\n",
"    \n",
"    'Outlines Agents': 'Outlines Agents',\n",
"    'Outlines Agent': 'Outlines Agents',\n",
"    \n",
"    'Minecraft Agent': 'Minecraft Agents',\n",
"    'Voyager MineCraft Agent': 'Minecraft Agents',\n",
"    \n",
"    'Agent Frameworks': 'Development Frameworks',\n",
"    'Copilot Frameworks': 'Development Frameworks',\n",
"    \n",
"    'AI Analysis Agent': 'Utility Agents',\n",
"    'Code Analysis Agent': 'Utility Agents',\n",
"    'File Management Agent': 'Utility Agents',\n",
"    'Utility Function': 'Utility Agents',\n",
"    'WebBrowser Agent': 'Utility Agents',\n",
"    \n",
"    'Data Structures': 'Data Processing Agents',\n",
"    'Data Structure': 'Data Processing Agents',\n",
"    'Data Compression': 'Data Processing Agents',\n",
"    \n",
"    'DSPy Agents': 'DSPy Agents',\n",
"    'LLM Agents': 'LLM Agents',\n",
"    'Instructor Agents': 'Instructor Agents',\n",
"    'Autogen Agents': 'Autogen Agents',\n",
"    'LlamaIndex Agents': 'LlamaIndex Agents',\n",
"    'Langchain Agents': 'Langchain Agents',\n",
"}\n",
"# Fallback bucket for any category not listed in CATEGORY_MAPPING.\n",
"DEFAULT_CATEGORY = 'Other Agents'\n",
"\n",
"def balance_agentic_dataset(target_size=25):\n",
"    \"\"\"Load json-mode-agentic, merge categories, and cap each at target_size.\n",
"\n",
"    Returns the balanced list of examples. (The original version of this cell\n",
"    ended without a return statement, so callers silently received None; a\n",
"    more verbose variant of this function is redefined in a later cell.)\n",
"    \"\"\"\n",
"    dataset = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')\n",
"\n",
"    category_groups = defaultdict(list)\n",
"    for item in dataset['train']:\n",
"        original_category = item['category']\n",
"        mapped_category = CATEGORY_MAPPING.get(original_category, DEFAULT_CATEGORY)\n",
"        category_groups[mapped_category].append(item)\n",
"\n",
"    print(\"\\nOriginal distribution after category mapping:\")\n",
"    for cat, items in category_groups.items():\n",
"        print(f\"{cat}: {len(items)}\")\n",
"\n",
"    # Balance dataset: downsample large categories, keep small ones whole.\n",
"    balanced_data = []\n",
"    for category, items in category_groups.items():\n",
"        if len(items) > target_size:\n",
"            balanced_data.extend(random.sample(items, target_size))\n",
"        else:\n",
"            balanced_data.extend(items)\n",
"    return balanced_data"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "13a4bbb3-30d2-42a2-b5ef-4c77fc7aa90f",
"metadata": {},
"outputs": [],
"source": [
"DEFAULT_CATEGORY = 'Other Agents'\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7ebaf2d-fb69-4257-9879-0be534a9d03f",
"metadata": {},
"outputs": [],
"source": [
"def balance_agentic_dataset(target_size=25, seed=None):\n",
"    \"\"\"Load json-mode-agentic, merge categories, and cap each at target_size.\n",
"\n",
"    Categories are collapsed through CATEGORY_MAPPING (unknown ones fall into\n",
"    DEFAULT_CATEGORY); each merged category is randomly downsampled to at most\n",
"    target_size examples. Prints before/after distributions and returns the\n",
"    balanced list of examples.\n",
"\n",
"    Args:\n",
"        target_size: maximum number of examples kept per merged category.\n",
"        seed: optional int for reproducible sampling; None keeps the\n",
"            previous non-deterministic behaviour.\n",
"    \"\"\"\n",
"    dataset = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')\n",
"    rng = random.Random(seed)\n",
"\n",
"    category_groups = defaultdict(list)\n",
"    for item in dataset['train']:\n",
"        original_category = item['category']\n",
"        mapped_category = CATEGORY_MAPPING.get(original_category, DEFAULT_CATEGORY)\n",
"        category_groups[mapped_category].append(item)\n",
"\n",
"    print(\"\\nOriginal distribution after category mapping:\")\n",
"    for cat, items in category_groups.items():\n",
"        print(f\"{cat}: {len(items)}\")\n",
"\n",
"    # Balance dataset: downsample large categories, keep small ones whole.\n",
"    balanced_data = []\n",
"    for category, items in category_groups.items():\n",
"        if len(items) > target_size:\n",
"            balanced_data.extend(rng.sample(items, target_size))\n",
"        else:\n",
"            balanced_data.extend(items)\n",
"\n",
"    final_distribution = defaultdict(int)\n",
"    for item in balanced_data:\n",
"        mapped_category = CATEGORY_MAPPING.get(item['category'], DEFAULT_CATEGORY)\n",
"        final_distribution[mapped_category] += 1\n",
"\n",
"    print(\"\\nFinal distribution:\")\n",
"    df_final = pd.DataFrame(list(final_distribution.items()), \n",
"                           columns=['Category', 'Count'])\n",
"    df_final['Percentage'] = df_final['Count'] / len(balanced_data) * 100\n",
"    print(df_final.sort_values('Count', ascending=False))\n",
"    \n",
"    print(f\"\\nOriginal dataset size: {len(dataset['train'])}\")\n",
"    print(f\"Balanced dataset size: {len(balanced_data)}\")\n",
"    \n",
"    return balanced_data\n"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "f77af624-232a-4802-80f9-93fd458cc573",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Original distribution after category mapping:\n",
"Simulacra Agents: 130\n",
"Other Agents: 241\n",
"Data Processing Agents: 3\n",
"Utility Agents: 30\n",
"Development Frameworks: 111\n",
"Minecraft Agents: 54\n",
"Outlines Agents: 26\n",
"DSPy Agents: 457\n",
"LLM Agents: 93\n",
"Langchain Agents: 40\n",
"Autogen Agents: 45\n",
"Instructor Agents: 68\n",
"LlamaIndex Agents: 44\n",
"\n",
"Final distribution:\n",
" Category Count Percentage\n",
"0 Simulacra Agents 25 8.250825\n",
"1 Other Agents 25 8.250825\n",
"3 Utility Agents 25 8.250825\n",
"4 Development Frameworks 25 8.250825\n",
"5 Minecraft Agents 25 8.250825\n",
"9 Langchain Agents 25 8.250825\n",
"6 Outlines Agents 25 8.250825\n",
"7 DSPy Agents 25 8.250825\n",
"8 LLM Agents 25 8.250825\n",
"11 Instructor Agents 25 8.250825\n",
"10 Autogen Agents 25 8.250825\n",
"12 LlamaIndex Agents 25 8.250825\n",
"2 Data Processing Agents 3 0.990099\n",
"\n",
"Original dataset size: 1342\n",
"Balanced dataset size: 303\n"
]
}
],
"source": [
"balanced_data = balance_agentic_dataset(25)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "105933dd-5b05-41cf-8e61-cbe9abdc82de",
"metadata": {},
"outputs": [],
"source": [
"balanced_dataset = Dataset.from_list(balanced_data)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "48674a73-227c-4a23-aa27-72bed628f0d9",
"metadata": {},
"outputs": [],
"source": [
"dataset_dict = DatasetDict({\n",
" 'train': balanced_dataset\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "0b9fd38b-c600-4f3e-9bcf-741340ae9218",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "718bda35da8b4d06a788be51d2234d8b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Saving the dataset (0/1 shards): 0%| | 0/303 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset_dict.save_to_disk(\"balanced-json-modeagentic\")"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "e7c52093-c1ea-4a0d-a469-86571cca8784",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['id', 'conversations', 'category', 'subcategory', 'schema'],\n",
" num_rows: 303\n",
"})"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_dict['train']"
]
},
{
"cell_type": "markdown",
"id": "a7c8acd0-188b-4897-b8e3-57f4024428bb",
"metadata": {},
"source": [
"## Func-Calling"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b879509f-c8ed-42b4-9edd-b3b0056d2196",
"metadata": {},
"outputs": [],
"source": [
"def balance_func_dataset(target_size=25, seed=None):\n",
"    \"\"\"Load func-calling.json and cap each category at target_size examples.\n",
"\n",
"    Categories with fewer than target_size examples are kept whole; larger\n",
"    ones are randomly downsampled. Prints before/after distributions and\n",
"    returns the balanced list of examples.\n",
"\n",
"    Args:\n",
"        target_size: maximum number of examples kept per category.\n",
"        seed: optional int for reproducible sampling; None keeps the\n",
"            previous non-deterministic behaviour.\n",
"    \"\"\"\n",
"    dataset = load_dataset('json', data_files='hermes-function-calling-v1/func-calling.json')\n",
"    rng = random.Random(seed)\n",
"\n",
"    category_groups = defaultdict(list)\n",
"    for item in dataset['train']:\n",
"        category_groups[item['category']].append(item)\n",
"\n",
"    print(\"\\nOriginal distribution:\")\n",
"    for cat, items in category_groups.items():\n",
"        print(f\"{cat}: {len(items)}\")\n",
"\n",
"    # Balance dataset - cap at target_size but keep smaller categories as is\n",
"    balanced_data = []\n",
"    for category, items in category_groups.items():\n",
"        if len(items) > target_size:\n",
"            balanced_data.extend(rng.sample(items, target_size))\n",
"        else:\n",
"            balanced_data.extend(items)\n",
"\n",
"    final_distribution = defaultdict(int)\n",
"    for item in balanced_data:\n",
"        final_distribution[item['category']] += 1\n",
"\n",
"    print(\"\\nFinal distribution:\")\n",
"    df_final = pd.DataFrame(list(final_distribution.items()), \n",
"                           columns=['Category', 'Count'])\n",
"    df_final['Percentage'] = df_final['Count'] / len(balanced_data) * 100\n",
"    print(df_final.sort_values('Count', ascending=False))\n",
"    \n",
"    print(f\"\\nOriginal dataset size: {len(dataset['train'])}\")\n",
"    print(f\"Balanced dataset size: {len(balanced_data)}\")\n",
"    \n",
"    return balanced_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1fc2cd0-0cdf-4e04-baa0-b90bae27a76a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Original distribution:\n",
"IoT and Home Automation: 15\n",
"Quantum Computing: 8\n",
"Services Industry Software: 17\n",
"IoT Platforms: 8\n",
"Communication Services Software: 37\n",
"Materials Software: 9\n",
"E-commerce Platforms: 23\n",
"Blockchain Integration: 5\n",
"Data Centers and High Performance Computing: 5\n",
"Information Extraction: 796\n",
"Data Privacy: 10\n",
"Annotation: 4\n",
"Model APIs: 23\n",
"Project Management: 3\n",
"Financial Services Apps: 29\n",
"Decentralized Apps (DApps): 3\n",
"Use Apps: 37\n",
"Office Administration: 4\n",
"Document Ranking: 1\n",
"CRM: 3\n",
"Marketing: 1\n",
"Code Interpreters: 9\n",
"Algorithmic Trading: 39\n",
"Energy Software: 9\n",
"Utilities Software: 52\n",
"Information Technology Software: 35\n",
"Data Processing: 19\n",
"Database Management: 10\n",
"Operating System Functions: 14\n",
"Text Classification: 3\n",
"Robotics and Automation: 8\n",
"Networking and Cybersecurity: 6\n",
"Consumer Discretionary Software: 25\n",
"Named Entity Recognition: 8\n",
"Information Retrieval (RAG): 20\n",
"Productivity Tools Integration: 8\n",
"Web APIs: 9\n",
"Low-Code Enterprise Platforms: 19\n",
"Robotic Process Automation (RPA): 51\n",
"DevOps: 3\n",
"Industrial Software: 65\n",
"Voice Assistants: 10\n",
"Healthcare Software: 40\n",
"Desktop Applications: 33\n",
"OpenAI API Integration: 29\n",
"Business Intelligence: 1\n",
"Financial Software: 46\n",
"API Call: 119\n",
"SaaS Platforms: 28\n",
"Relation Extraction: 3\n",
"Topic Modelling: 1\n",
"Compliance: 3\n",
"Consumer Staples Software: 29\n",
"Real Estate Software: 9\n",
"AI Model Integration: 12\n",
"Mobile Applications: 34\n",
"Web Browser Agent: 11\n",
"Git Operations: 5\n",
"Identity and Access Management (IAM): 3\n",
"SAP: 2\n",
"HR: 4\n",
"Cloud Platforms: 16\n",
"Accounting & Finance: 2\n",
"\n",
"Final distribution:\n",
" Category Count Percentage\n",
"4 Communication Services Software 25 3.052503\n",
"14 Financial Services Apps 25 3.052503\n",
"9 Information Extraction 25 3.052503\n",
"24 Utilities Software 25 3.052503\n",
"25 Information Technology Software 25 3.052503\n",
".. ... ... ...\n",
"59 SAP 2 0.244200\n",
"18 Document Ranking 1 0.122100\n",
"20 Marketing 1 0.122100\n",
"45 Business Intelligence 1 0.122100\n",
"50 Topic Modelling 1 0.122100\n",
"\n",
"[63 rows x 3 columns]\n",
"\n",
"Original dataset size: 1893\n",
"Balanced dataset size: 819\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ad046fb8ab7943a7971785ef5c33f41b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Saving the dataset (0/1 shards): 0%| | 0/819 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Saved balanced dataset to balanced_func_calling\n"
]
}
],
"source": [
"def save_as_hf_dataset(balanced_data, output_path='balanced_func_calling'):\n",
"    \"\"\"Wrap `balanced_data` (a list of example dicts) in a DatasetDict with a\n",
"    single 'train' split, save it to disk at `output_path`, and return the\n",
"    DatasetDict.\n",
"    \"\"\"\n",
"    balanced_dataset = Dataset.from_list(balanced_data)\n",
"    \n",
"    dataset_dict = DatasetDict({\n",
"        'train': balanced_dataset\n",
"    })\n",
"    \n",
"    dataset_dict.save_to_disk(output_path)\n",
"    print(f\"\\nSaved balanced dataset to {output_path}\")\n",
"    return dataset_dict\n",
"\n",
"# Balance the func-calling split, then persist it as a HF dataset on disk.\n",
"balanced_data = balance_func_dataset(25)\n",
"dataset_dict = save_as_hf_dataset(balanced_data)"
]
},
{
"cell_type": "markdown",
"id": "82d2b786-a822-457f-96b5-e02e1730be01",
"metadata": {},
"source": [
"## Merge both single-turn datasets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2d8eec4-97df-4abb-b248-e9236a777fd6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Original func_single size: 1893\n",
"Original json_single size: 1241\n",
"\n",
"func_calling_singleturn:\n",
"Number of categories: 63\n",
"Samples per category: 2\n",
"\n",
"Distribution for func_calling_singleturn:\n",
" Category Count Percentage\n",
"0 IoT and Home Automation 2 1.639344\n",
"1 Quantum Computing 2 1.639344\n",
"2 Services Industry Software 2 1.639344\n",
"3 IoT Platforms 2 1.639344\n",
"4 Communication Services Software 2 1.639344\n",
".. ... ... ...\n",
"58 Identity and Access Management (IAM) 2 1.639344\n",
"18 Document Ranking 1 0.819672\n",
"20 Marketing 1 0.819672\n",
"45 Business Intelligence 1 0.819672\n",
"50 Topic Modelling 1 0.819672\n",
"\n",
"[63 rows x 3 columns]\n",
"\n",
"func_calling_singleturn final size: 122\n",
"\n",
"json_mode_singleturn:\n",
"Number of categories: 23\n",
"Samples per category: 6\n",
"\n",
"Distribution for json_mode_singleturn:\n",
" Category Count Percentage\n",
"0 Pharmaceuticals, Biotechnology & Life Sciences 6 4.347826\n",
"1 Capital Goods 6 4.347826\n",
"2 Transportation 6 4.347826\n",
"3 Materials 6 4.347826\n",
"4 Software & Services 6 4.347826\n",
"5 Consumer Durables & Apparel 6 4.347826\n",
"6 Commercial & Professional Services 6 4.347826\n",
"7 Energy 6 4.347826\n",
"8 Media & Entertainment 6 4.347826\n",
"9 JSON Schema 6 4.347826\n",
"10 Food, Beverage & Tobacco 6 4.347826\n",
"11 Telecommunication Services 6 4.347826\n",
"12 Semiconductors & Semiconductor Equipment 6 4.347826\n",
"13 Automobiles & Components 6 4.347826\n",
"14 Household & Personal Products 6 4.347826\n",
"15 Consumer Services 6 4.347826\n",
"16 Consumer Staples Distribution & Retail (New Name) 6 4.347826\n",
"17 Technology Hardware & Equipment 6 4.347826\n",
"18 Financial Services (New Name) 6 4.347826\n",
"19 Banks 6 4.347826\n",
"20 Insurance 6 4.347826\n",
"21 Health Care Equipment & Services 6 4.347826\n",
"22 Consumer Discretionary Distribution & Retail (... 6 4.347826\n",
"\n",
"json_mode_singleturn final size: 138\n",
"\n",
"Final merged dataset statistics:\n",
"Total examples: 260\n",
"From func_calling_singleturn: 122\n",
"From json_mode_singleturn: 138\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3d8cb0044406475cb114f1f6559c93ab",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Saving the dataset (0/1 shards): 0%| | 0/260 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Saved merged dataset to balanced_singleturn_merged\n"
]
}
],
"source": [
"from datasets import load_dataset, Dataset, DatasetDict\n",
"import random\n",
"from collections import defaultdict\n",
"import pandas as pd\n",
"import math\n",
"\n",
"def downsample_and_tag_dataset(dataset, source_name, target_total=150):\n",
"    \"\"\"Downsample dataset['train'] to roughly target_total examples and tag them.\n",
"\n",
"    Each category is capped at max(1, floor(target_total / num_categories))\n",
"    examples, so the final size may differ from target_total. Every kept item\n",
"    is mutated in place with item['dataset_source'] = source_name. Prints the\n",
"    resulting distribution and returns the downsampled list.\n",
"    \"\"\"\n",
"    category_groups = defaultdict(list)\n",
"    for item in dataset['train']:\n",
"        category_groups[item['category']].append(item)\n",
"    \n",
"    num_categories = len(category_groups)\n",
"    samples_per_category = max(1, math.floor(target_total / num_categories))\n",
"    \n",
"    print(f\"\\n{source_name}:\")\n",
"    print(f\"Number of categories: {num_categories}\")\n",
"    print(f\"Samples per category: {samples_per_category}\")\n",
"    \n",
"    # Downsample large categories; categories under the quota are kept whole.\n",
"    balanced_data = []\n",
"    for category, items in category_groups.items():\n",
"        if len(items) > samples_per_category:\n",
"            sampled_items = random.sample(items, samples_per_category)\n",
"            balanced_data.extend(sampled_items)\n",
"        else:\n",
"            balanced_data.extend(items)\n",
"    \n",
"    # Add source tag to each example\n",
"    for item in balanced_data:\n",
"        item['dataset_source'] = source_name\n",
"    \n",
"    final_distribution = defaultdict(int)\n",
"    for item in balanced_data:\n",
"        final_distribution[item['category']] += 1\n",
"    \n",
"    print(f\"\\nDistribution for {source_name}:\")\n",
"    df_final = pd.DataFrame(list(final_distribution.items()), \n",
"                           columns=['Category', 'Count'])\n",
"    df_final['Percentage'] = df_final['Count'] / len(balanced_data) * 100\n",
"    print(df_final.sort_values('Count', ascending=False))\n",
"    \n",
"    print(f\"\\n{source_name} final size: {len(balanced_data)}\")\n",
"    return balanced_data\n",
"\n",
"def merge_and_save_datasets(target_per_dataset=150):\n",
"    \"\"\"Build and persist the merged single-turn dataset.\n",
"\n",
"    Downsamples func-calling-singleturn and json-mode-singleturn to roughly\n",
"    target_per_dataset examples each, tags every example with its source,\n",
"    concatenates them, saves the result to 'balanced_singleturn_merged', and\n",
"    returns the resulting DatasetDict.\n",
"    \"\"\"\n",
"    func_single = load_dataset('json', data_files='hermes-function-calling-v1/func-calling-singleturn.json')\n",
"    json_single = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-singleturn.json')\n",
"    \n",
"    print(f\"Original func_single size: {len(func_single['train'])}\")\n",
"    print(f\"Original json_single size: {len(json_single['train'])}\")\n",
"    \n",
"    func_balanced = downsample_and_tag_dataset(func_single, 'func_calling_singleturn', target_per_dataset)\n",
"    json_balanced = downsample_and_tag_dataset(json_single, 'json_mode_singleturn', target_per_dataset)\n",
"    \n",
"    # Merge datasets\n",
"    merged_data = func_balanced + json_balanced\n",
"    \n",
"    merged_dataset = Dataset.from_list(merged_data)\n",
"    dataset_dict = DatasetDict({\n",
"        'train': merged_dataset\n",
"    })\n",
"\n",
"    print(\"\\nFinal merged dataset statistics:\")\n",
"    print(f\"Total examples: {len(merged_data)}\")\n",
"    print(f\"From func_calling_singleturn: {len(func_balanced)}\")\n",
"    print(f\"From json_mode_singleturn: {len(json_balanced)}\")\n",
"    \n",
"    output_path = 'balanced_singleturn_merged'\n",
"    dataset_dict.save_to_disk(output_path)\n",
"    print(f\"\\nSaved merged dataset to {output_path}\")\n",
"    \n",
"    return dataset_dict\n",
"\n",
"# Run the merging process\n",
"merged_dataset = merge_and_save_datasets(150)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "7a737c6f-b2e2-4362-bc1c-2fff95b3488a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['id', 'conversations', 'category', 'subcategory', 'task', 'dataset_source'],\n",
" num_rows: 260\n",
" })\n",
"})"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_dataset"
]
},
{
"cell_type": "markdown",
"id": "2d1d1591-f972-4d64-ac30-d3cafb402825",
"metadata": {},
"source": [
"## Glaive"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "8f5b404f-4fad-42a0-8168-8b333cf21064",
"metadata": {},
"outputs": [],
"source": [
"def create_category_mapping():\n",
"    \"\"\"Create mapping from original categories to new merged categories.\n",
"\n",
"    Glaive categories not listed here are dropped entirely by\n",
"    balance_dataset(), which only keeps items whose category appears\n",
"    as a key in this mapping.\n",
"    \"\"\"\n",
"    return {\n",
"        # Technology & Computing\n",
"        'Technology': 'tech_computing',\n",
"        'Programming Concepts': 'tech_computing',\n",
"        'Programming and Computer Science Questions': 'tech_computing',\n",
"        'Web Development and Design': 'tech_computing',\n",
"        'Database and SQL': 'tech_computing',\n",
"        'Swift Programming': 'tech_computing',\n",
"        'Cybersecurity and Encryption': 'tech_computing',\n",
"        \n",
"        # Data & Analytics\n",
"        'Data Science': 'data_analytics',\n",
"        'Data Analysis and Programming': 'data_analytics',\n",
"        'Machine Learning': 'data_analytics',\n",
"        'Natural Language Processing': 'data_analytics',\n",
"        \n",
"        # Finance & Business\n",
"        'Stocks and Orders': 'finance_business',\n",
"        'Loan and Financial Calculations': 'finance_business',\n",
"        'Finance & Economics': 'finance_business',\n",
"        'Business Strategies': 'finance_business',\n",
"        \n",
"        # Science & Education\n",
"        'Science Education': 'science_education',\n",
"        'Science and Nature Exploration': 'science_education',\n",
"        'Quantum Physics': 'science_education',\n",
"        'Climate and Environmental Solutions': 'science_education',\n",
"        \n",
"        # Services & Productivity\n",
"        'Flight Services': 'services_productivity',\n",
"        'Location Services': 'services_productivity',\n",
"        'Productivity': 'services_productivity',\n",
"        'Request Management': 'services_productivity',\n",
"        \n",
"        # Knowledge & Culture\n",
"        'History and Culture': 'knowledge_culture',\n",
"        'Book Search': 'knowledge_culture',\n",
"        'Literary Analysis': 'knowledge_culture',\n",
"        'Language and Linguistics': 'knowledge_culture',\n",
"        'Language and Logic': 'knowledge_culture'\n",
"    }\n",
"\n",
"def balance_dataset(dataset: list[dict], target_size: int = 500) -> list[dict]:\n",
"    \"\"\"\n",
"    Balance the dataset by:\n",
"    1. Merging categories via create_category_mapping()\n",
"    2. Downsampling each merged category to target_size // n_categories\n",
"    3. Dropping items whose category is not in the mapping\n",
"\n",
"    NOTE: uses built-in generics (list[dict]) instead of typing.List/Dict,\n",
"    which were never imported in this notebook and raised NameError on a\n",
"    fresh kernel.\n",
"    NOTE: mutates the sampled items in place by overwriting item['category']\n",
"    with the merged category name.\n",
"    \"\"\"\n",
"    category_mapping = create_category_mapping()\n",
"    \n",
"    # Group data by new (merged) categories; unmapped categories are dropped.\n",
"    new_categories = defaultdict(list)\n",
"    for item in dataset:\n",
"        if item['category'] in category_mapping:\n",
"            new_cat = category_mapping[item['category']]\n",
"            new_categories[new_cat].append(item)\n",
"    \n",
"    if not new_categories:\n",
"        # Nothing mapped: avoid a ZeroDivisionError below.\n",
"        return []\n",
"    \n",
"    # Calculate target size per category (floor division means the final\n",
"    # total may undershoot target_size slightly).\n",
"    n_categories = len(new_categories)\n",
"    target_per_category = target_size // n_categories\n",
"    \n",
"    # Balance categories: downsample large ones, keep small ones whole.\n",
"    balanced_data = []\n",
"    for category, items in new_categories.items():\n",
"        if len(items) > target_per_category:\n",
"            sampled_items = random.sample(items, target_per_category)\n",
"        else:\n",
"            # If we have fewer items than target, use all of them\n",
"            sampled_items = items\n",
"        \n",
"        # Update category name in items (in-place mutation of the originals).\n",
"        for item in sampled_items:\n",
"            item['category'] = category\n",
"            balanced_data.append(item)\n",
"    \n",
"    return balanced_data\n",
"\n",
"def analyze_balanced_dataset(balanced_data: list[dict]):\n",
"    \"\"\"Print the per-category distribution and total size of the balanced data.\n",
"\n",
"    Uses built-in generics (list[dict]) because typing.List/Dict were never\n",
"    imported in this notebook (the original annotation raised NameError on a\n",
"    fresh kernel).\n",
"    \"\"\"\n",
"    category_counts = defaultdict(int)\n",
"    for item in balanced_data:\n",
"        category_counts[item['category']] += 1\n",
"    \n",
"    print(\"\\n=== Balanced Dataset Distribution ===\")\n",
"    for category, count in sorted(category_counts.items()):\n",
"        print(f\"{category}: {count}\")\n",
"    print(f\"\\nTotal samples: {sum(category_counts.values())}\")"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "070c3c91-4cf8-4667-a2aa-c089c75c3a23",
"metadata": {},
"outputs": [],
"source": [
"balanced_data = balance_dataset(glaive_ds['train'])"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "24bf99ca-e9b6-4aa9-952e-0c422c57da1e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Balanced Dataset Distribution ===\n",
"data_analytics: 83\n",
"finance_business: 83\n",
"knowledge_culture: 83\n",
"science_education: 83\n",
"services_productivity: 83\n",
"tech_computing: 83\n",
"\n",
"Total samples: 498\n"
]
}
],
"source": [
"analyze_balanced_dataset(balanced_data)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "f2927eed-f46e-4486-9b57-df7601ccda1f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': '15d547ad-bc66-4b75-8d07-3258b424c023',\n",
" 'conversations': [{'from': 'system',\n",
" 'value': \"You are a function calling AI model. You are provided with function signatures within XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools: [{'type': 'function', 'function': {'name': 'get_stock_price', 'description': 'Get the current stock price of a company', 'parameters': {'type': 'object', 'properties': {'company': {'type': 'string', 'description': 'The name of the company'}, 'symbol': {'type': 'string', 'description': 'The stock symbol of the company'}}, 'required': ['company', 'symbol']}}}, {'type': 'function', 'function': {'name': 'search_movies', 'description': 'Search for movies based on title or genre', 'parameters': {'type': 'object', 'properties': {'title': {'type': 'string', 'description': 'The title of the movie'}, 'genre': {'type': 'string', 'description': 'The genre of the movie'}}, 'required': []}}}] Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}For each function call return a json object with function name and arguments within XML tags as follows:\\n\\n{tool_call}\\n\"},\n",
" {'from': 'human',\n",
" 'value': 'Hi, I would like to know the current stock price of Apple.'},\n",
" {'from': 'gpt',\n",
" 'value': 'Sure, I can help with that. Could you please provide me with the stock symbol of Apple?'},\n",
" {'from': 'human', 'value': 'Yes, the stock symbol for Apple is AAPL.'},\n",
" {'from': 'gpt',\n",
" 'value': \"\\n{'name': 'get_stock_price', 'arguments': {'company': 'Apple', 'symbol': 'AAPL'}}\\n\"},\n",
" {'from': 'tool',\n",
" 'value': \"\\n{'status': 'success', 'message': 'The current stock price of Apple (AAPL) is $150'}\\n\"},\n",
" {'from': 'gpt', 'value': 'The current stock price of Apple (AAPL) is $150.'},\n",
" {'from': 'human', 'value': 'Thank you for the information.'},\n",
" {'from': 'gpt',\n",
" 'value': \"You're welcome! If you have any other questions, feel free to ask.\"}],\n",
" 'tools': '[{\"type\": \"function\", \"function\": {\"name\": \"get_stock_price\", \"description\": \"Get the current stock price of a company\", \"parameters\": {\"type\": \"object\", \"properties\": {\"company\": {\"type\": \"string\", \"description\": \"The name of the company\"}, \"symbol\": {\"type\": \"string\", \"description\": \"The stock symbol of the company\"}}, \"required\": [\"company\", \"symbol\"]}}}, {\"type\": \"function\", \"function\": {\"name\": \"search_movies\", \"description\": \"Search for movies based on title or genre\", \"parameters\": {\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\", \"description\": \"The title of the movie\"}, \"genre\": {\"type\": \"string\", \"description\": \"The genre of the movie\"}}, \"required\": []}}}]',\n",
" 'category': 'finance_business',\n",
" 'subcategory': None,\n",
" 'task': 'Stock price inquiries for Apple (AAPL)',\n",
" 'source': 'Glaive'}"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"balanced_data[24]"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "0a4eabb6-dd9d-49e1-a6f9-73a76898bd74",
"metadata": {},
"outputs": [],
"source": [
"balanced_dataset = datasets.Dataset.from_list(balanced_data)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "9d1986b5-5b1a-445c-b684-22b2ca7d361b",
"metadata": {},
"outputs": [],
"source": [
"dataset_dict = datasets.DatasetDict({\n",
" 'train': balanced_dataset\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "e80d2773-ac1e-443f-839c-82e8bddaad07",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a4d7aa3f0a6043dc85590cd480e344e6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Saving the dataset (0/1 shards): 0%| | 0/498 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset_dict.save_to_disk(\"glaive-balanced\")"
]
},
{
"cell_type": "code",
"execution_count": 104,
"id": "03262560-c031-4ec2-b3f5-b468e792af38",
"metadata": {},
"outputs": [],
"source": [
"#fin"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}