{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "572d0ab1-36be-4ed3-be72-c316c7f7f142",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset, load_from_disk, concatenate_datasets\n",
    "from datasets import Dataset, DatasetDict\n",
    "import json\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "c82b6ea7-b2c3-42fb-8e4c-1e8ab2f33810",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████| 11300/11300 [00:00<00:00, 20878.02it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Found 45 instances of consecutive assistant messages\n",
      "Found 0 instances where tool message wasn't preceded by assistant\n",
      "Will drop 45 conversations total\n",
      "\n",
      "Original dataset size: 11300\n",
      "Cleaned dataset size: 11255\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "def find_consecutive_indices(dataset):\n",
    "    \"\"\"Locate rows whose 'cot_conversations' have a malformed turn order.\n",
    "\n",
    "    A row is flagged when either\n",
    "      * two assistant turns ('gpt' / 'assistant') appear back to back, or\n",
    "      * a tool turn ('tool' / 'ipython') is the first turn or does not\n",
    "        directly follow an assistant turn.\n",
    "\n",
    "    Args:\n",
    "        dataset: Indexable collection with a length; dataset[i] must expose\n",
    "            a 'cot_conversations' list of turns, each with a 'from' key.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (indices, num_consecutive, num_missing_assistant): the flagged\n",
    "        row indices as a list, the number of rows with back-to-back assistant\n",
    "        turns, and the number of rows with an unpreceded tool turn. A row\n",
    "        failing both checks is counted in both totals but listed once.\n",
    "    \"\"\"\n",
    "    assistant_roles = ('gpt', 'assistant')\n",
    "    tool_roles = ('tool', 'ipython')\n",
    "\n",
    "    def has_adjacent_assistants(turns):\n",
    "        # True as soon as any neighbouring pair is assistant -> assistant\n",
    "        for pos in range(1, len(turns)):\n",
    "            earlier = turns[pos - 1]['from']\n",
    "            later = turns[pos]['from']\n",
    "            if earlier in assistant_roles and later in assistant_roles:\n",
    "                return True\n",
    "        return False\n",
    "\n",
    "    def has_unpreceded_tool(turns):\n",
    "        # True as soon as a tool turn lacks an assistant turn right before it\n",
    "        for pos, turn in enumerate(turns):\n",
    "            if turn['from'] not in tool_roles:\n",
    "                continue\n",
    "            if pos == 0 or turns[pos - 1]['from'] not in assistant_roles:\n",
    "                return True\n",
    "        return False\n",
    "\n",
    "    flagged = set()\n",
    "    consecutive_rows = 0\n",
    "    orphan_tool_rows = 0\n",
    "\n",
    "    for row in tqdm(range(len(dataset))):\n",
    "        turns = dataset[row]['cot_conversations']\n",
    "        if has_adjacent_assistants(turns):\n",
    "            consecutive_rows += 1\n",
    "            flagged.add(row)\n",
    "        if has_unpreceded_tool(turns):\n",
    "            orphan_tool_rows += 1\n",
    "            flagged.add(row)\n",
    "\n",
    "    return list(flagged), consecutive_rows, orphan_tool_rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "4083a201-779f-44c4-bc2c-8d1aae30dd98",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the raw hermes function-calling JSON file into `ds`.\n",
    "# NOTE(review): absolute, user-specific path; consider a configurable data directory.\n",
    "ds = load_dataset(\"json\",data_files=\"/home/sanyambhutani/task_datasets/1_Downloaded/hermes-function-calling-v1/func-calling.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "c4d7c7c4-a72b-4d67-80aa-b6695dc261fe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['id', 'conversations', 'cot_conversations'],\n",
       "    num_rows: 11300\n",
       "})"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the dataset loaded in the previous cell. This cell used to display\n",
    "# ds_toolace, which is not assigned until a later cell and so raised NameError\n",
    "# on a fresh kernel.\n",
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "acff0dad-fb66-45c1-a3f8-6d7a908f9060",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the saved ToolACE dataset from disk (per the path, the variant with CoT added).\n",
    "ds_toolace = load_from_disk(\"/home/sanyambhutani/task_datasets/3_CoT_added/ToolACE/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "6d244726-4fd3-445b-bfee-85e1dce9b370",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the saved hermes single-turn function-calling dataset from disk.\n",
    "ds_single_turn = load_from_disk(\"/home/sanyambhutani/task_datasets/3_CoT_added/hermes-function-calling-v1/func-calling-single-turn/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "bc0ea408-d2ae-4278-8497-31e7c59cacf4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the saved hermes multi-turn function-calling dataset from disk.\n",
    "ds_multi_turn = load_from_disk(\"/home/sanyambhutani/task_datasets/3_CoT_added/hermes-function-calling-v1/func-calling-multi-turn/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "893d0121-5d7a-42e0-90a9-9ed5a36e7f91",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the saved glaive function-calling dataset from disk.\n",
    "ds_glaive = load_from_disk(\"/home/sanyambhutani/task_datasets/3_CoT_added/hermes-function-calling-v1/glaive-function-calling/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "860b68e2-9941-415a-b952-350450c1ea5a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['id', 'conversations', 'cot_conversations'],\n",
       "    num_rows: 11300\n",
       "})"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the ToolACE dataset: feature names and row count.\n",
    "ds_toolace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "238073c9-1253-49e0-b453-958a301fcc9c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████| 11300/11300 [00:00<00:00, 19691.10it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Found 45 instances of consecutive assistant messages\n",
      "Found 0 instances where tool message wasn't preceded by assistant\n",
      "Will drop 45 conversations total\n",
      "\n",
      "Original dataset size: 11300\n",
      "Cleaned dataset size: 11255\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Scan ToolACE for malformed turn order and report what will be removed\n",
    "indices_to_drop, num_consecutive, num_missing_assistant = find_consecutive_indices(ds_toolace)\n",
    "print(f\"\\nFound {num_consecutive} instances of consecutive assistant messages\")\n",
    "print(f\"Found {num_missing_assistant} instances where tool message wasn't preceded by assistant\")\n",
    "print(f\"Will drop {len(indices_to_drop)} conversations total\")\n",
    "\n",
    "# Create new dataset without the problematic conversations.\n",
    "# Membership is tested against a set: with the returned list, the comprehension\n",
    "# would scan every dropped index for every row.\n",
    "drop_lookup = set(indices_to_drop)\n",
    "keep_indices = [i for i in range(len(ds_toolace)) if i not in drop_lookup]\n",
    "ds_toolace_cleaned = ds_toolace.select(keep_indices)\n",
    "# Sanity check: exactly the flagged rows were removed\n",
    "assert len(ds_toolace_cleaned) == len(ds_toolace) - len(drop_lookup)\n",
    "print(f\"\\nOriginal dataset size: {len(ds_toolace)}\")\n",
    "print(f\"Cleaned dataset size: {len(ds_toolace_cleaned)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "id": "a7ebf595-ede5-4e63-ad6e-db82d5c14610",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████| 912/912 [00:00<00:00, 19699.99it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Found 5 instances of consecutive assistant messages\n",
      "Found 0 instances where tool message wasn't preceded by assistant\n",
      "Will drop 5 conversations total\n",
      "\n",
      "Original dataset size: 912\n",
      "Cleaned dataset size: 907\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Scan the single-turn set for malformed turn order and report what will be removed\n",
    "indices_to_drop, num_consecutive, num_missing_assistant = find_consecutive_indices(ds_single_turn)\n",
    "print(f\"\\nFound {num_consecutive} instances of consecutive assistant messages\")\n",
    "print(f\"Found {num_missing_assistant} instances where tool message wasn't preceded by assistant\")\n",
    "print(f\"Will drop {len(indices_to_drop)} conversations total\")\n",
    "\n",
    "# Create new dataset without the problematic conversations.\n",
    "# Membership is tested against a set: with the returned list, the comprehension\n",
    "# would scan every dropped index for every row.\n",
    "drop_lookup = set(indices_to_drop)\n",
    "keep_indices = [i for i in range(len(ds_single_turn)) if i not in drop_lookup]\n",
    "ds_single_turn_cleaned = ds_single_turn.select(keep_indices)\n",
    "# Sanity check: exactly the flagged rows were removed\n",
    "assert len(ds_single_turn_cleaned) == len(ds_single_turn) - len(drop_lookup)\n",
    "print(f\"\\nOriginal dataset size: {len(ds_single_turn)}\")\n",
    "print(f\"Cleaned dataset size: {len(ds_single_turn_cleaned)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "id": "85562c4b-920f-427d-a889-6d0b38ae94f9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████████████| 912/912 [00:00<00:00, 15763.84it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Found 14 instances of consecutive assistant messages\n",
      "Found 1 instances where tool message wasn't preceded by assistant\n",
      "Will drop 15 conversations total\n",
      "\n",
      "Original dataset size: 912\n",
      "Cleaned dataset size: 897\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Scan the multi-turn set for malformed turn order and report what will be removed\n",
    "indices_to_drop, num_consecutive, num_missing_assistant = find_consecutive_indices(ds_multi_turn)\n",
    "print(f\"\\nFound {num_consecutive} instances of consecutive assistant messages\")\n",
    "print(f\"Found {num_missing_assistant} instances where tool message wasn't preceded by assistant\")\n",
    "print(f\"Will drop {len(indices_to_drop)} conversations total\")\n",
    "\n",
    "# Create new dataset without the problematic conversations.\n",
    "# Membership is tested against a set: with the returned list, the comprehension\n",
    "# would scan every dropped index for every row.\n",
    "drop_lookup = set(indices_to_drop)\n",
    "keep_indices = [i for i in range(len(ds_multi_turn)) if i not in drop_lookup]\n",
    "ds_multi_turn_cleaned = ds_multi_turn.select(keep_indices)\n",
    "# Sanity check: exactly the flagged rows were removed\n",
    "assert len(ds_multi_turn_cleaned) == len(ds_multi_turn) - len(drop_lookup)\n",
    "print(f\"\\nOriginal dataset size: {len(ds_multi_turn)}\")\n",
    "print(f\"Cleaned dataset size: {len(ds_multi_turn_cleaned)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "id": "2f6e3299-a4fe-4976-84a5-bc001c954dee",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████| 2641/2641 [00:00<00:00, 12092.32it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Found 14 instances of consecutive assistant messages\n",
      "Found 0 instances where tool message wasn't preceded by assistant\n",
      "Will drop 14 conversations total\n",
      "\n",
      "Original dataset size: 2641\n",
      "Cleaned dataset size: 2627\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Scan the glaive set for malformed turn order and report what will be removed\n",
    "indices_to_drop, num_consecutive, num_missing_assistant = find_consecutive_indices(ds_glaive)\n",
    "print(f\"\\nFound {num_consecutive} instances of consecutive assistant messages\")\n",
    "print(f\"Found {num_missing_assistant} instances where tool message wasn't preceded by assistant\")\n",
    "print(f\"Will drop {len(indices_to_drop)} conversations total\")\n",
    "\n",
    "# Create new dataset without the problematic conversations.\n",
    "# Membership is tested against a set: with the returned list, the comprehension\n",
    "# would scan every dropped index for every row.\n",
    "drop_lookup = set(indices_to_drop)\n",
    "keep_indices = [i for i in range(len(ds_glaive)) if i not in drop_lookup]\n",
    "ds_glaive_cleaned = ds_glaive.select(keep_indices)\n",
    "# Sanity check: exactly the flagged rows were removed\n",
    "assert len(ds_glaive_cleaned) == len(ds_glaive) - len(drop_lookup)\n",
    "print(f\"\\nOriginal dataset size: {len(ds_glaive)}\")\n",
    "print(f\"Cleaned dataset size: {len(ds_glaive_cleaned)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "id": "f8e57c1b-fc25-4b45-867b-61c1e8a9bfa3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the cleaned glaive, multi-turn and ToolACE subsets into one dataset.\n",
    "# NOTE(review): ds_single_turn_cleaned is built above but is not part of this list; confirm that is intended.\n",
    "final_ds = concatenate_datasets([ds_glaive_cleaned, ds_multi_turn_cleaned, ds_toolace_cleaned])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "id": "4a6cd5e5-fab1-4c1a-a82c-94b3ab4c6a0a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['id', 'conversations', 'cot_conversations'],\n",
       "    num_rows: 14779\n",
       "})"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the merged dataset: feature names and total row count.\n",
    "final_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "id": "1f3702f4-5969-4aaa-a3c3-7a73ca247a61",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fft.py\t\t new-toolcall.py  toolcall.py\n",
      "ft-config.yaml\t prep-for-FT.py   train_data.json\n",
      "new-config.yaml  __pycache__\t  train_final_mix.json\n"
     ]
    }
   ],
   "source": [
    "# List the fine-tuning scripts directory, which is where the merged JSON gets written.\n",
    "!ls ../scripts/finetuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "id": "f3a678ec-b231-4191-81c4-96cc2426b2aa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "95c235ffd8d64918a1eed401bd0f555e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "112301178"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Export the merged dataset next to the fine-tuning scripts.\n",
    "# NOTE(review): presumably written as JSON Lines (one record per line) despite the .json suffix; confirm consumers expect that.\n",
    "final_ds.to_json(\"../scripts/finetuning/train_final_mix.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "id": "df251bf8-fc4a-4a26-8099-c7a65992e84a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "666d9d431bdc4e5baaed8bb356d0d09d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Read the exported file back to check it round-trips; note this rebinds `ds`.\n",
    "ds = load_dataset(\"json\",data_files=\"../scripts/finetuning/train_final_mix.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "id": "959b4b55-6b05-4087-b75a-5b9e2461a398",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['id', 'conversations', 'cot_conversations'],\n",
       "        num_rows: 14779\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the reloaded dataset: splits, feature names and row count.\n",
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "db2427b1-3317-434c-8db5-41eadac01c77",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'from': 'system',\n",
       "  'value': 'You are an expert in composing functions. You are given a question and a set of possible functions.\\nBased on the question, you will need to make one or more function/tool calls to achieve the purpose.\\nIf none of the function can be used, point it out. If the given question lacks the parameters required by the function,\\nalso point it out. You should only return the function call in tools call sections.\\n\\nIf you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\\n\\nHere is a list of functions in JSON format that you can invoke[{\"type\": \"function\", \"function\": {\"name\": \"get_stock_price\", \"description\": \"Get the current stock price\", \"parameters\": {\"type\": \"object\", \"properties\": {\"stock_symbol\": {\"type\": \"string\", \"description\": \"The symbol of the stock\"}}, \"required\": [\"stock_symbol\"]}}}, {\"type\": \"function\", \"function\": {\"name\": \"create_todo\", \"description\": \"Create a new todo item\", \"parameters\": {\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\", \"description\": \"The title of the todo item\"}, \"description\": {\"type\": \"string\", \"description\": \"The description of the todo item\"}, \"due_date\": {\"type\": \"string\", \"format\": \"date\", \"description\": \"The due date of the todo item\"}}, \"required\": [\"title\", \"due_date\"]}}}]'},\n",
       " {'from': 'human',\n",
       "  'value': \"Hi, can you tell me the current price of Apple's stock?\"},\n",
       " {'from': 'gpt',\n",
       "  'value': \"To answer this question, I need to retrieve the current stock price of Apple. The function 'get_stock_price' seems relevant to this task. It requires a 'stock_symbol' parameter, which in this case should be 'AAPL' for Apple. Therefore, I will call the 'get_stock_price' function with the 'stock_symbol' parameter set to 'AAPL'. [get_stock_price(stock_symbol='AAPL')]\"},\n",
       " {'from': 'tool', 'value': \"{'stock_price': 145.09}\"},\n",
       " {'from': 'gpt',\n",
       "  'value': \"The current price of Apple's stock (AAPL) is $145.09.\"},\n",
       " {'from': 'human',\n",
       "  'value': \"That's great. Can you also tell me the price of Microsoft's stock?\"},\n",
       " {'from': 'gpt',\n",
       "  'value': \"To answer this question, I need to retrieve the current stock price of Microsoft. Similar to the previous question, the 'get_stock_price' function is relevant. This time, the 'stock_symbol' parameter should be set to 'MSFT' for Microsoft. Thus, I will call the 'get_stock_price' function with the 'stock_symbol' parameter set to 'MSFT'. [get_stock_price(stock_symbol='MSFT')]\"},\n",
       " {'from': 'tool', 'value': \"{'stock_price': 284.91}\"},\n",
       " {'from': 'gpt',\n",
       "  'value': \"The current price of Microsoft's stock (MSFT) is $284.91.\"},\n",
       " {'from': 'human', 'value': 'Thank you for the information.'},\n",
       " {'from': 'gpt',\n",
       "  'value': \"You're welcome! If you have any other questions, feel free to ask.\"}]"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the first CoT conversation. Index the row first, then the column:\n",
    "# ds['train']['cot_conversations'][0] reads the whole column just to display one example.\n",
    "ds['train'][0]['cot_conversations']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 333,
   "id": "b2d41f8d-9eae-45b1-bf10-58824d33da6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_dataset_to_json(dataset, split, output_file):\n",
    "    \"\"\"Dump the records under dataset[split] to a pretty-printed JSON file.\n",
    "\n",
    "    Args:\n",
    "        dataset: Object indexable by `split`; iterating dataset[split]\n",
    "            yields the records to write.\n",
    "        split: Key used to select the records from `dataset`.\n",
    "        output_file: Path of the JSON file to create (overwritten if present).\n",
    "\n",
    "    Mapping records are copied into plain dicts. Any other record (for\n",
    "    example a conversation stored as a list of message dicts) is written\n",
    "    unchanged: passing such a list to dict() treats each two-key message as\n",
    "    a key/value pair and collapses the conversation to {'from': 'value'}.\n",
    "    \"\"\"\n",
    "    from collections.abc import Mapping\n",
    "\n",
    "    data_list = [\n",
    "        dict(item) if isinstance(item, Mapping) else item\n",
    "        for item in dataset[split]\n",
    "    ]\n",
    "    with open(output_file, 'w', encoding='utf-8') as f:\n",
    "        json.dump(data_list, f, ensure_ascii=False, indent=2)\n",
    "\n",
    "# NOTE(review): final_ds is a flat Dataset here, so 'cot_conversations' selects that column\n",
    "# rather than a split; verify that output.json contains the full conversations.\n",
    "save_dataset_to_json(final_ds, 'cot_conversations', '/home/sanyambhutani/task_datasets/output.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 334,
   "id": "24c763a5-4e8d-487e-ab85-c3a811a0358d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "994ec68974f344b6b7246d2c8610ef88",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Reload the file written by save_dataset_to_json. data_files previously pointed at the\n",
    "# parent directory instead of output.json, so the load was not limited to that file.\n",
    "ds = load_dataset(\"json\",data_files=\"/home/sanyambhutani/task_datasets/output.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 335,
   "id": "ab5f9c75-19cd-4915-88b6-478399e88af4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['from'],\n",
       "        num_rows: 15697\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 335,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect what the reload produced: splits, feature names and row count.\n",
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "5853b925-3cd4-4859-9691-5cce7b5a1d6f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of consecutive gpt->gpt pairs: 0\n",
      "Number of examples with consecutive gpt messages: 0\n",
      "Percentage of examples with consecutive gpt messages: 0.00%\n"
     ]
    }
   ],
   "source": [
    "def check_consecutive_gpt(conversations):\n",
    "    \"\"\"Count adjacent turn pairs in which both turns come from 'gpt'.\n",
    "\n",
    "    Args:\n",
    "        conversations: Indexable sequence of turns, each exposing a 'from' key.\n",
    "\n",
    "    Returns:\n",
    "        Number of positions k where turns k and k + 1 are both 'gpt'\n",
    "        (overlapping pairs are each counted); 0 for fewer than two turns.\n",
    "    \"\"\"\n",
    "    last_start = len(conversations) - 1\n",
    "    return sum(\n",
    "        1\n",
    "        for k in range(last_start)\n",
    "        if conversations[k]['from'] == 'gpt' and conversations[k + 1]['from'] == 'gpt'\n",
    "    )\n",
    "\n",
    "# Count across all examples in a single pass; the per-example counts are kept\n",
    "# so the column is read once and nothing is recomputed for the report below\n",
    "cot_column = ds['train']['cot_conversations']\n",
    "pair_counts = [check_consecutive_gpt(example) for example in cot_column]\n",
    "\n",
    "total_consecutive = sum(pair_counts)\n",
    "examples_with_consecutive = sum(1 for count in pair_counts if count > 0)\n",
    "num_examples = len(pair_counts)\n",
    "# An empty split would otherwise raise ZeroDivisionError in the percentage line\n",
    "pct_with_consecutive = (examples_with_consecutive / num_examples) * 100 if num_examples else 0.0\n",
    "\n",
    "print(f\"Total number of consecutive gpt->gpt pairs: {total_consecutive}\")\n",
    "print(f\"Number of examples with consecutive gpt messages: {examples_with_consecutive}\")\n",
    "print(f\"Percentage of examples with consecutive gpt messages: {pct_with_consecutive:.2f}%\")\n",
    "\n",
    "# Let's also get a specific example to examine (the first one with any gpt->gpt pair)\n",
    "first_idx = next((idx for idx, count in enumerate(pair_counts) if count > 0), None)\n",
    "if first_idx is not None:\n",
    "    example = cot_column[first_idx]\n",
    "    print(f\"\\nFirst example found at index {first_idx}:\")\n",
    "    for i in range(len(example) - 1):\n",
    "        if example[i]['from'] == 'gpt' and example[i + 1]['from'] == 'gpt':\n",
    "            print(\"\\nFirst message:\", example[i]['value'])\n",
    "            print(\"\\nSecond message:\", example[i + 1]['value'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b744ff1b-c83f-4c6a-b8a7-5207c0213578",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}