{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "572d0ab1-36be-4ed3-be72-c316c7f7f142", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset, load_from_disk, concatenate_datasets\n", "from datasets import Dataset, DatasetDict\n", "import json\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 96, "id": "c82b6ea7-b2c3-42fb-8e4c-1e8ab2f33810", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|███████████████████████████████████████████████████████████████████████████| 11300/11300 [00:00<00:00, 20878.02it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Found 45 instances of consecutive assistant messages\n", "Found 0 instances where tool message wasn't preceded by assistant\n", "Will drop 45 conversations total\n", "\n", "Original dataset size: 11300\n", "Cleaned dataset size: 11255\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "def find_consecutive_indices(dataset):\n", " indices_to_drop = set()\n", " num_consecutive = 0\n", " num_missing_assistant = 0\n", " \n", " for idx in tqdm(range(len(dataset))):\n", " conversations = dataset[idx]['cot_conversations']\n", " \n", " # Check for consecutive assistant messages\n", " for i in range(len(conversations) - 1):\n", " current = conversations[i]['from']\n", " next_msg = conversations[i + 1]['from']\n", " \n", " if ((current in ['gpt', 'assistant']) and \n", " (next_msg in ['gpt', 'assistant'])):\n", " num_consecutive += 1\n", " indices_to_drop.add(idx)\n", " break\n", " \n", " # Check for tool messages without preceding assistant\n", " for i in range(len(conversations)):\n", " current = conversations[i]\n", " if current['from'] in ['tool', 'ipython']:\n", " if i == 0 or conversations[i-1]['from'] not in ['gpt', 'assistant']:\n", " num_missing_assistant += 1\n", " indices_to_drop.add(idx)\n", " break\n", " \n", " return list(indices_to_drop), num_consecutive, num_missing_assistant" ] }, { "cell_type": "code", "execution_count": 88, "id": "4083a201-779f-44c4-bc2c-8d1aae30dd98", "metadata": {}, "outputs": [], "source": [ "ds = load_dataset(\"json\",data_files=\"/home/sanyambhutani/task_datasets/1_Downloaded/hermes-function-calling-v1/func-calling.json\")" ] }, { "cell_type": "code", "execution_count": 89, "id": "c4d7c7c4-a72b-4d67-80aa-b6695dc261fe", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['id', 'conversations', 'cot_conversations'],\n", " num_rows: 11300\n", "})" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ds_toolace" ] }, { "cell_type": "code", "execution_count": 90, "id": "acff0dad-fb66-45c1-a3f8-6d7a908f9060", "metadata": {}, "outputs": [], "source": [ "ds_toolace = load_from_disk(\"/home/sanyambhutani/task_datasets/3_CoT_added/ToolACE/\")" ] }, { "cell_type": "code", "execution_count": 91, "id": "6d244726-4fd3-445b-bfee-85e1dce9b370", "metadata": {}, "outputs": [], "source": [ "ds_single_turn = load_from_disk(\"/home/sanyambhutani/task_datasets/3_CoT_added/hermes-function-calling-v1/func-calling-single-turn/\")" ] }, { "cell_type": "code", "execution_count": 92, "id": "bc0ea408-d2ae-4278-8497-31e7c59cacf4", "metadata": {}, "outputs": [], "source": [ "ds_multi_turn = load_from_disk(\"/home/sanyambhutani/task_datasets/3_CoT_added/hermes-function-calling-v1/func-calling-multi-turn/\")" ] }, { "cell_type": "code", "execution_count": 93, "id": "893d0121-5d7a-42e0-90a9-9ed5a36e7f91", "metadata": {}, "outputs": [], "source": [ "ds_glaive = load_from_disk(\"/home/sanyambhutani/task_datasets/3_CoT_added/hermes-function-calling-v1/glaive-function-calling/\")" ] }, { "cell_type": "code", "execution_count": 94, "id": "860b68e2-9941-415a-b952-350450c1ea5a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['id', 'conversations', 'cot_conversations'],\n", " num_rows: 11300\n", "})" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ds_toolace" ] }, { "cell_type": "code", "execution_count": 100, "id": "238073c9-1253-49e0-b453-958a301fcc9c", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|███████████████████████████████████████████████████████████████████████████| 11300/11300 [00:00<00:00, 19691.10it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Found 45 instances of consecutive assistant messages\n", "Found 0 instances where tool message wasn't preceded by assistant\n", "Will drop 45 conversations total\n", "\n", "Original dataset size: 11300\n", "Cleaned dataset size: 11255\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "indices_to_drop, num_consecutive, num_missing_assistant = find_consecutive_indices(ds_toolace)\n", "print(f\"\\nFound {num_consecutive} instances of consecutive assistant messages\")\n", "print(f\"Found {num_missing_assistant} instances where tool message wasn't preceded by assistant\")\n", "print(f\"Will drop {len(indices_to_drop)} conversations total\")\n", "\n", "# Create new dataset without the problematic conversations\n", "keep_indices = [i for i in range(len(ds_toolace)) if i not in indices_to_drop]\n", "ds_toolace_cleaned = ds_toolace.select(keep_indices)\n", "print(f\"\\nOriginal dataset size: {len(ds_toolace)}\")\n", "print(f\"Cleaned dataset size: {len(ds_toolace_cleaned)}\")" ] }, { "cell_type": "code", "execution_count": 101, "id": "a7ebf595-ede5-4e63-ad6e-db82d5c14610", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|███████████████████████████████████████████████████████████████████████████████| 912/912 [00:00<00:00, 19699.99it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Found 5 instances of consecutive assistant messages\n", "Found 0 instances where tool message wasn't preceded by assistant\n", "Will drop 5 conversations total\n", "\n", "Original dataset size: 912\n", "Cleaned dataset size: 907\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "indices_to_drop, num_consecutive, num_missing_assistant = find_consecutive_indices(ds_single_turn)\n", "print(f\"\\nFound {num_consecutive} instances of consecutive assistant messages\")\n", "print(f\"Found {num_missing_assistant} instances where tool message wasn't preceded by assistant\")\n", "print(f\"Will drop {len(indices_to_drop)} conversations total\")\n", "\n", "# Create new dataset without the problematic conversations\n", "keep_indices = [i for i in range(len(ds_single_turn)) if i not in indices_to_drop]\n", "ds_single_turn_cleaned = ds_single_turn.select(keep_indices)\n", "print(f\"\\nOriginal dataset size: {len(ds_single_turn)}\")\n", "print(f\"Cleaned dataset size: {len(ds_single_turn_cleaned)}\")" ] }, { "cell_type": "code", "execution_count": 102, "id": "85562c4b-920f-427d-a889-6d0b38ae94f9", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|███████████████████████████████████████████████████████████████████████████████| 912/912 [00:00<00:00, 15763.84it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Found 14 instances of consecutive assistant messages\n", "Found 1 instances where tool message wasn't preceded by assistant\n", "Will drop 15 conversations total\n", "\n", "Original dataset size: 912\n", "Cleaned dataset size: 897\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "indices_to_drop, num_consecutive, num_missing_assistant = find_consecutive_indices(ds_multi_turn)\n", "print(f\"\\nFound {num_consecutive} instances of consecutive assistant messages\")\n", "print(f\"Found {num_missing_assistant} instances where tool message wasn't preceded by assistant\")\n", "print(f\"Will drop {len(indices_to_drop)} conversations total\")\n", "\n", "# Create new dataset without the problematic conversations\n", "keep_indices = [i for i in range(len(ds_multi_turn)) if i not in indices_to_drop]\n", "ds_multi_turn_cleaned = ds_multi_turn.select(keep_indices)\n", "print(f\"\\nOriginal dataset size: {len(ds_multi_turn)}\")\n", "print(f\"Cleaned dataset size: {len(ds_multi_turn_cleaned)}\")" ] }, { "cell_type": "code", "execution_count": 104, "id": "2f6e3299-a4fe-4976-84a5-bc001c954dee", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|█████████████████████████████████████████████████████████████████████████████| 2641/2641 [00:00<00:00, 12092.32it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Found 14 instances of consecutive assistant messages\n", "Found 0 instances where tool message wasn't preceded by assistant\n", "Will drop 14 conversations total\n", "\n", "Original dataset size: 2641\n", "Cleaned dataset size: 2627\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "indices_to_drop, num_consecutive, num_missing_assistant = find_consecutive_indices(ds_glaive)\n", "print(f\"\\nFound {num_consecutive} instances of consecutive assistant messages\")\n", "print(f\"Found {num_missing_assistant} instances where tool message wasn't preceded by assistant\")\n", "print(f\"Will drop {len(indices_to_drop)} conversations total\")\n", "\n", "# Create new dataset without the problematic conversations\n", "keep_indices = [i for i in range(len(ds_glaive)) if i not in indices_to_drop]\n", "ds_glaive_cleaned = ds_glaive.select(keep_indices)\n", "print(f\"\\nOriginal dataset size: {len(ds_glaive)}\")\n", "print(f\"Cleaned dataset size: {len(ds_glaive_cleaned)}\")" ] }, { "cell_type": "code", "execution_count": 105, "id": "f8e57c1b-fc25-4b45-867b-61c1e8a9bfa3", "metadata": {}, "outputs": [], "source": [ "final_ds = concatenate_datasets([ds_glaive_cleaned, ds_multi_turn_cleaned, ds_toolace_cleaned])" ] }, { "cell_type": "code", "execution_count": 106, "id": "4a6cd5e5-fab1-4c1a-a82c-94b3ab4c6a0a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['id', 'conversations', 'cot_conversations'],\n", " num_rows: 14779\n", "})" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_ds" ] }, { "cell_type": "code", "execution_count": 107, "id": "1f3702f4-5969-4aaa-a3c3-7a73ca247a61", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "fft.py\t\t new-toolcall.py toolcall.py\n", "ft-config.yaml\t prep-for-FT.py train_data.json\n", "new-config.yaml __pycache__\t train_final_mix.json\n" ] } ], "source": [ "!ls ../scripts/finetuning" ] }, { "cell_type": "code", "execution_count": 108, "id": "f3a678ec-b231-4191-81c4-96cc2426b2aa", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "95c235ffd8d64918a1eed401bd0f555e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Creating json from Arrow format: 0%| | 0/15 [00:00gpt pairs: 0\n", "Number of examples with consecutive gpt messages: 0\n", "Percentage of examples with consecutive gpt messages: 0.00%\n" ] } ], "source": [ "def check_consecutive_gpt(conversations):\n", " consecutive_count = 0\n", " for i in range(len(conversations) - 1):\n", " if conversations[i]['from'] == 'gpt' and conversations[i + 1]['from'] == 'gpt':\n", " consecutive_count += 1\n", " return consecutive_count\n", "\n", "# Count across all examples\n", "total_consecutive = 0\n", "examples_with_consecutive = 0\n", "\n", "for example in ds['train']['cot_conversations']:\n", " count = check_consecutive_gpt(example)\n", " if count > 0:\n", " examples_with_consecutive += 1\n", " total_consecutive += count\n", "\n", "print(f\"Total number of consecutive gpt->gpt pairs: {total_consecutive}\")\n", "print(f\"Number of examples with consecutive gpt messages: {examples_with_consecutive}\")\n", "print(f\"Percentage of examples with consecutive gpt messages: {(examples_with_consecutive/len(ds['train']))*100:.2f}%\")\n", "\n", "# Let's also get a specific example to examine\n", "for idx, example in enumerate(ds['train']['cot_conversations']):\n", " count = check_consecutive_gpt(example)\n", " if count > 0:\n", " print(f\"\\nFirst example found at index {idx}:\")\n", " for i in range(len(example) - 1):\n", " if example[i]['from'] == 'gpt' and example[i + 1]['from'] == 'gpt':\n", " print(\"\\nFirst message:\", example[i]['value'])\n", " print(\"\\nSecond message:\", example[i + 1]['value'])\n", " break" ] }, { "cell_type": "code", "execution_count": null, "id": "b744ff1b-c83f-4c6a-b8a7-5207c0213578", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 5 }