{
"cells": [
{
"cell_type": "code",
"execution_count": 14,
"id": "96a4cb34-d8da-4b16-a929-98bfae2ac668",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Updated Git hooks.\n",
"Git LFS initialized.\n"
]
}
],
"source": [
"!git lfs install"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "0978b512-1a88-447e-b80a-ee4b72152038",
"metadata": {},
"outputs": [],
"source": [
"!cd hermes-function-calling-v1\n",
"!git lfs pull"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "4910830d-f346-414e-a2e2-9be3e1695cab",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset, load_from_disk\n",
"import pandas as pd\n",
"import numpy as np\n",
"from collections import Counter, defaultdict\n",
"import json\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import re\n",
"from tqdm import tqdm\n",
"import networkx as nx"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "29af79d6-8f06-4483-96c9-749c965e1d95",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"func-calling.json\t\t json-mode-agentic.json\n",
"func-calling-singleturn.json\t json-mode-singleturn.json\n",
"glaive-function-calling-5k.json README.md\n"
]
}
],
"source": [
"!ls hermes-function-calling-v1/"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "ae877e54-6c13-4988-bb7c-0b27cc156662",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "598e4023888740cfa97cb73566d1a073",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating train split: 0 examples [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset = load_dataset('json', data_files='hermes-function-calling-v1/func-calling.json')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "fd2bc49f-278d-4078-b1bf-07a8fc7e9c05",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['id', 'conversations', 'category', 'subcategory', 'task'],\n",
" num_rows: 1893\n",
"})"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset['train']"
]
},
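{
"cell_type": "markdown",
"id": "b7f1c2d3-1111-4aaa-8bbb-1d2e3f4a5b6c",
"metadata": {},
"source": [
"Quick sanity check: a minimal sketch of how the examples spread across the `category` column shown in the schema above, using the already-imported `Counter`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8a2d3e4-2222-4bbb-9ccc-2e3f4a5b6c7d",
"metadata": {},
"outputs": [],
"source": [
"# Count examples per top-level category (column name taken from the schema above)\n",
"category_counts = Counter(dataset['train']['category'])\n",
"for category, count in category_counts.most_common(10):\n",
"    print(f'{category}: {count}')"
]
},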
{
"cell_type": "code",
"execution_count": 24,
"id": "4879906d-a204-4062-b09d-64ff21649327",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['id', 'conversations', 'category', 'subcategory', 'task'])"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset['train'][123].keys()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "e03129e2-7bcf-43b7-b99e-9526dce9072b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': '3bac4895-1a36-4380-8ef7-c36abfcc4ba4',\n",
" 'conversations': [{'from': 'system',\n",
" 'value': \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.\\n\\n[{'type': 'function', 'function': {'name': 'scale_up_cluster', 'description': 'Scales up the number of compute nodes in an HPC cluster.', 'parameters': {'type': 'object', 'properties': {'cluster_name': {'type': 'string', 'description': 'The name of the HPC cluster to scale up.'}, 'node_type': {'type': 'string', 'description': 'The type of compute node to add to the cluster.'}, 'additional_nodes': {'type': 'integer', 'description': 'The number of additional compute nodes to add to the cluster.'}}, 'required': ['cluster_name', 'node_type', 'additional_nodes']}}}, {'type': 'function', 'function': {'name': 'scale_down_cluster', 'description': 'Scales down the number of compute nodes in an HPC cluster.', 'parameters': {'type': 'object', 'properties': {'cluster_name': {'type': 'string', 'description': 'The name of the HPC cluster to scale down.'}, 'node_type': {'type': 'string', 'description': 'The type of compute node to remove from the cluster.'}, 'remove_nodes': {'type': 'integer', 'description': 'The number of compute nodes to remove from the cluster.'}}, 'required': ['cluster_name', 'node_type', 'remove_nodes']}}}]\\n\\nFor each function call return a json object with function name and arguments within tags with the following schema:\\n\\n{'arguments': , 'name': }\\n\\n\"},\n",
" {'from': 'human',\n",
" 'value': \"Our High Performance Computing (HPC) cluster, designated as 'CFD-Sim-Cluster', is engaged in complex computational fluid dynamics simulations. With an influx of computational tasks forecasted for the upcoming week, we need to proactively expand our cluster's capacity. Therefore, I request the addition of 50 new compute nodes, specifically of the 'c5.18xlarge' type, to handle the anticipated surge in processing demands. Once the simulations have been concluded and the need for augmented resources subsides, we will need to scale back by decommissioning the same number of nodes. Could you facilitate the necessary API calls to scale our HPC cluster up and subsequently down as per the described requirements?\\n\\nHere are the details for the API calls:\\n1. Scale up the cluster 'CFD-Sim-Cluster' by adding 50 compute nodes of type 'c5.18xlarge'.\\n2. After completion of the high-demand period, scale down the 'CFD-Sim-Cluster' by removing 50 nodes of type 'c5.18xlarge'.\\n\\nI would like these API calls to be initiated to adjust our cluster's resources accordingly.\"},\n",
" {'from': 'gpt',\n",
" 'value': \"\\n{'arguments': {'cluster_name': 'CFD-Sim-Cluster', 'node_type': 'c5.18xlarge', 'additional_nodes': 50}, 'name': 'scale_up_cluster'}\\n\\n\\n{'arguments': {'cluster_name': 'CFD-Sim-Cluster', 'node_type': 'c5.18xlarge', 'remove_nodes': 50}, 'name': 'scale_down_cluster'}\\n\\n\"},\n",
" {'from': 'tool',\n",
" 'value': '\\n{\"name\": \"scale_up_cluster\", \"content\": {\"cluster_name\": \"CFD-Sim-Cluster\", \"node_type\": \"c5.18xlarge\", \"additional_nodes\": 50, \"initiated_at\": \"2023-04-10T08:00:00Z\", \"expected_completion\": \"2023-04-10T12:00:00Z\", \"status\": \"scaling initiated\", \"additional_info\": {\"current_node_count\": 100, \"target_node_count\": 150, \"estimated_cost_increase\": \"$10,000/month\", \"region\": \"us-west-2\"}}}\\n\\n\\n{\"name\": \"scale_down_cluster\", \"content\": {\"cluster_name\": \"CFD-Sim-Cluster\", \"node_type\": \"c5.18xlarge\", \"remove_nodes\": 50, \"initiated_at\": \"2023-05-10T08:00:00Z\", \"expected_completion\": \"2023-05-10T12:00:00Z\", \"status\": \"scaling initiated\", \"additional_info\": {\"current_node_count\": 150, \"target_node_count\": 100, \"estimated_cost_savings\": \"$10,000/month\", \"region\": \"us-west-2\"}}}\\n\\n'},\n",
" {'from': 'gpt',\n",
" 'value': \"The necessary API calls to adjust the resources of the 'CFD-Sim-Cluster' have been initiated as per your requirements.\\n\\n1. The first call to **scale up** the cluster by adding 50 'c5.18xlarge' compute nodes was initiated on April 10, 2023, at 08:00 AM (timezone unspecified) and is expected to complete by 12:00 PM the same day. The cluster will expand from 100 to 150 nodes, which is estimated to increase costs by $10,000 per month. The operation is taking place in the 'us-west-2' region.\\n\\n2. The second call to **scale down** the cluster by removing 50 'c5.18xlarge' compute nodes is set to be initiated a month later, on May 10, 2023, at 08:00 AM, with an expected completion time of 12:00 PM on the same day. Following the scaling down, the cluster will return to 100 nodes, resulting in estimated cost savings of $10,000 per month. This operation will also occur in the 'us-west-2' region.\\n\\nPlease note that these operations are currently set to occur automatically based on the dates mentioned, therefore no further action is required at this moment to execute them. If you need to adjust the timings or other details, please let me know in advance so that the necessary changes can be made.\"}],\n",
" 'category': 'Data Centers and High Performance Computing',\n",
" 'subcategory': 'Cluster Scaling',\n",
" 'task': 'Scale High Performance Computing (HPC) Systems Up or Down with API Calls (POST Requests)'}"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset['train'][123]"
]
},
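{
"cell_type": "markdown",
"id": "d9b3e4f5-3333-4ccc-8ddd-3f4a5b6c7d8e",
"metadata": {},
"source": [
"Before reformatting anything, it is worth knowing how long these conversations are. A minimal sketch summarizing turns per conversation with the already-imported `pandas`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0c4f5a6-4444-4ddd-9eee-4a5b6c7d8e9f",
"metadata": {},
"outputs": [],
"source": [
"# Distribution of conversation lengths (turns per example)\n",
"turn_counts = pd.Series([len(ex['conversations']) for ex in dataset['train']])\n",
"turn_counts.describe()"
]
},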
{
"cell_type": "code",
"execution_count": null,
"id": "6506a240-e3f3-4dbe-8145-db92974efc83",
"metadata": {},
"outputs": [],
"source": [
"dataset = load_dataset('json', data_files='hermes-function-calling-v1/func-calling.json')"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "021f6bd4-ed02-49b7-83fe-33a96ec42156",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': 'f9b15d86-fb69-4f05-a65e-18f2f7b5369d',\n",
" 'conversations': [{'from': 'system',\n",
" 'value': \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.\\n\\n[{'type': 'function', 'function': {'name': 'min_max_normalize', 'description': 'Apply Min-Max normalization to scale features in the dataset to a specified range.', 'parameters': {'type': 'object', 'properties': {'data': {'type': 'string', 'description': 'The path to the dataset file.'}, 'feature_range': {'type': 'object', 'properties': {'min': {'type': 'number', 'description': 'The minimum value of the target range for normalization.'}, 'max': {'type': 'number', 'description': 'The maximum value of the target range for normalization.'}}, 'required': ['min', 'max']}}, 'required': ['data', 'feature_range']}}}, {'type': 'function', 'function': {'name': 'z_score_normalize', 'description': 'Apply Z-Score normalization to standardize features in the dataset to have a mean of 0 and a standard deviation of 1.', 'parameters': {'type': 'object', 'properties': {'data': {'type': 'string', 'description': 'The path to the dataset file.'}}, 'required': ['data']}}}, {'type': 'function', 'function': {'name': 'decimal_scaling_normalize', 'description': 'Apply Decimal Scaling normalization to shift the decimal point of values of the data.', 'parameters': {'type': 'object', 'properties': {'data': {'type': 'string', 'description': 'The path to the dataset file.'}}, 'required': ['data']}}}]\\n\\nFor each function call return a json object with function name and arguments within tags with the following schema:\\n\\n{'arguments': , 'name': }\\n\\n\"},\n",
" {'from': 'human',\n",
" 'value': \"I have a dataset, `dataset_normalized.csv`, which contains several features for a machine learning project. However, this data encompasses features with heterogeneous scales and units, which may hinder the predictive model's performance. To ensure that each feature has an equal impact on the model, the features need to be normalized to the same scale. I would like to perform Min-Max normalization on this data to adjust the features to a scale between 0 and 1. Here is the dataset for reference:\\n\\n```markdown\\n| Feature1 | Feature2 | Feature3 | ... | FeatureN |\\n|----------|----------|----------|-----|----------|\\n| 15 | 100 | 0.75 | ... | 40 |\\n| 22 | 80 | 0.85 | ... | 35 |\\n| 18 | 90 | 0.65 | ... | 38 |\\n| ... | ... | ... | ... | ... |\\n| 20 | 95 | 0.70 | ... | 37 |\\n```\\n\\nCould you please apply Min-Max normalization using the `min_max_normalize` function to scale the features within the range of 0 to 1 for the `dataset_normalized.csv` file? This preprocessing will ensure that our machine learning algorithm evaluates all features fairly.\"},\n",
" {'from': 'gpt',\n",
" 'value': \"\\n{'arguments': {'data': 'dataset_normalized.csv', 'feature_range': {'min': 0, 'max': 1}}, 'name': 'min_max_normalize'}\\n\\n\"}],\n",
" 'category': 'Data Processing',\n",
" 'subcategory': 'Data Normalization',\n",
" 'task': 'Data normalization methods'}"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d['train'][400]"
]
},
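{
"cell_type": "markdown",
"id": "f1d5a6b7-5555-4eee-8fff-5b6c7d8e9f0a",
"metadata": {},
"source": [
"Which role labels actually occur in the `from` field? A quick tally before remapping them; this motivates the mapping described below."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2e6b7c8-6666-4fff-9aaa-6c7d8e9f0a1b",
"metadata": {},
"outputs": [],
"source": [
"# Tally the raw role labels across every conversation turn\n",
"role_counts = Counter(turn['from'] for ex in d['train'] for turn in ex['conversations'])\n",
"role_counts"
]
},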
{
"cell_type": "markdown",
"id": "60426c28-10f6-41f3-b378-ace13a2faf6c",
"metadata": {},
"source": [
"So we need to address a few things: \n",
"\n",
"Mapping:\n",
"System is fine but we need to change the function and also the XML tag nonsense\n",
"\n",
"Human -> User\n",
"\n",
"GPT -> Assistant"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "7be0e5e3-4bd8-419a-8fe2-2c775c852753",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "64dbc269b5194effa9111c35a6c51b29",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/1893 [00:00, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def process_dataset(dataset):\n",
" def replace_terms(data):\n",
" if isinstance(data, str):\n",
" return data.replace('human', 'user').replace('gpt', 'assistant')\n",
" elif isinstance(data, list):\n",
" return [replace_terms(item) for item in data]\n",
" elif isinstance(data, dict):\n",
" return {k: replace_terms(v) for k, v in data.items()}\n",
" return data\n",
" \n",
" return dataset.map(lambda x: {k: replace_terms(v) for k, v in x.items()})\n",
"\n",
"processed_dataset = process_dataset(d)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "30420c48-c62b-473d-8e61-6f100218e092",
"metadata": {},
"outputs": [],
"source": [
"d = processed_dataset"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "1f38c4f4-bdf0-451c-96cf-76ceca840e8b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': '936b90da-dbeb-4864-a6e6-28899965265d',\n",
" 'conversations': [{'from': 'system',\n",
" 'value': \"You are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.\\n\\n[{'type': 'function', 'function': {'name': 'scale_up_cluster', 'description': 'Scales up the number of compute nodes in an HPC cluster.', 'parameters': {'type': 'object', 'properties': {'cluster_name': {'type': 'string', 'description': 'The name of the HPC cluster to scale up.'}, 'node_type': {'type': 'string', 'description': 'The type of compute node to add to the cluster.'}, 'additional_nodes': {'type': 'integer', 'description': 'The number of additional compute nodes to add to the cluster.'}}, 'required': ['cluster_name', 'node_type', 'additional_nodes']}}}, {'type': 'function', 'function': {'name': 'scale_down_cluster', 'description': 'Scales down the number of compute nodes in an HPC cluster.', 'parameters': {'type': 'object', 'properties': {'cluster_name': {'type': 'string', 'description': 'The name of the HPC cluster to scale down.'}, 'node_type': {'type': 'string', 'description': 'The type of compute node to remove from the cluster.'}, 'remove_nodes': {'type': 'integer', 'description': 'The number of compute nodes to remove from the cluster.'}}, 'required': ['cluster_name', 'node_type', 'remove_nodes']}}}]\\n\\nFor each function call return a json object with function name and arguments within tags with the following schema:\\n\\n{'arguments': , 'name': }\\n\\n\"},\n",
" {'from': 'user',\n",
" 'value': \"Our High Performance Computing (HPC) cluster, designated as 'CFD-Sim-Cluster', is engaged in complex computational fluid dynamics simulations. With an influx of computational tasks forecasted for the upcoming week, we need to proactively expand our cluster's capacity. Therefore, I request the addition of 50 new compute nodes, specifically of the 'c5.18xlarge' type, to handle the anticipated surge in processing demands. Once the simulations have been concluded and the need for augmented resources subsides, we will need to scale back by decommissioning the same number of nodes. Could you facilitate the necessary API calls to scale our HPC cluster up and subsequently down as per the described requirements?\\n\\nHere are the details for the API calls:\\n1. Scale up the cluster 'CFD-Sim-Cluster' by adding 50 compute nodes of type 'c5.18xlarge'.\\n2. After completion of the high-demand period, scale down the 'CFD-Sim-Cluster' by removing 50 nodes of type 'c5.18xlarge'.\\n\\nI would like these API calls to be initiated to adjust our cluster's resources accordingly.\"},\n",
" {'from': 'assistant',\n",
" 'value': \"\\n{'arguments': {'cluster_name': 'CFD-Sim-Cluster', 'node_type': 'c5.18xlarge', 'additional_nodes': 50}, 'name': 'scale_up_cluster'}\\n\\n\\n{'arguments': {'cluster_name': 'CFD-Sim-Cluster', 'node_type': 'c5.18xlarge', 'remove_nodes': 50}, 'name': 'scale_down_cluster'}\\n\\n\"}],\n",
" 'category': 'Data Centers and High Performance Computing',\n",
" 'subcategory': 'Cluster Scaling',\n",
" 'task': 'Scale High Performance Computing (HPC) Systems Up or Down with API Calls (POST Requests)'}"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d['train'][123]"
]
},
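{
"cell_type": "markdown",
"id": "b3f7c8d9-7777-4aaa-8bbb-7d8e9f0a1b2c",
"metadata": {},
"source": [
"Verify the remap: after `process_dataset`, the only role labels left should be `system`, `user`, `assistant`, and `tool`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4a8d9e0-8888-4bbb-9ccc-8e9f0a1b2c3d",
"metadata": {},
"outputs": [],
"source": [
"# Re-tally role labels on the processed dataset; 'human' and 'gpt' should be gone\n",
"Counter(turn['from'] for ex in d['train'] for turn in ex['conversations'])"
]
},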
{
"cell_type": "code",
"execution_count": 40,
"id": "6662be80-8afb-419b-bc7f-8db07fae578a",
"metadata": {},
"outputs": [],
"source": [
"NEW_SYSTEM_MESSAGE = \"\"\"You are an expert in composing functions. You are given a question and a set of possible functions.\n",
"Based on the question, you will need to make one or more function/tool calls to achieve the purpose.\n",
"If none of the function can be used, point it out. If the given question lacks the parameters required by the function,\n",
"also point it out. You should only return the function call in tools call sections.\n",
"If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\n",
"You SHOULD NOT include any other text in the response.\n",
"Here is a list of functions in JSON format that you can invoke.\"\"\""
]
}
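,
{
"cell_type": "markdown",
"id": "d5b9e0f1-9999-4ccc-8ddd-9f0a1b2c3d4e",
"metadata": {},
"source": [
"A possible next step: swap `NEW_SYSTEM_MESSAGE` into each conversation while keeping the example's own tool list. This is only a sketch, not a verified implementation: the regex that pulls the bracketed tool list out of the old system prompt is an assumption based on the handful of examples inspected above, so check it against the full dataset before mapping."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6c0f1a2-aaaa-4ddd-9eee-0a1b2c3d4e5f",
"metadata": {},
"outputs": [],
"source": [
"def rewrite_system_message(example):\n",
"    # Sketch: replace the old system prompt with NEW_SYSTEM_MESSAGE while keeping\n",
"    # the embedded tool list. The regex assumes the tool list is the bracketed\n",
"    # [{'type': 'function', ...}] block seen in the examples above.\n",
"    convs = example['conversations']\n",
"    if convs and convs[0]['from'] == 'system':\n",
"        match = re.search(r\"\\[\\{.*\\}\\]\", convs[0]['value'], flags=re.DOTALL)\n",
"        tools = match.group(0) if match else ''\n",
"        convs[0] = {'from': 'system', 'value': NEW_SYSTEM_MESSAGE + '\\n' + tools}\n",
"    return {'conversations': convs}\n",
"\n",
"# d = d.map(rewrite_system_message)  # uncomment once the regex is verified"
]
}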
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}