
rm extra nbs

Sanyam Bhutani 2 months ago
parent
commit
ba76bad847

File diff suppressed because it is too large
+ 0 - 31
end-to-end-use-cases/data-tool/Notebooks/Looking_at_Datasets.ipynb


+ 9 - 70
end-to-end-use-cases/data-tool/Notebooks/Down-sample-Nous.ipynb

@@ -2,25 +2,6 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 14,
-   "id": "96a4cb34-d8da-4b16-a929-98bfae2ac668",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Updated Git hooks.\n",
-      "Git LFS initialized.\n"
-     ]
-    }
-   ],
-   "source": [
-    "!git lfs install"
-   ]
-  },
-  {
-   "cell_type": "code",
    "execution_count": 15,
    "id": "0978b512-1a88-447e-b80a-ee4b72152038",
    "metadata": {},
@@ -46,13 +27,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": null,
    "id": "ec662394-2ae9-4081-87dd-bf75c4f76500",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def analyze_distribution(data: List[Dict]) -> Tuple[Dict, Dict]:\n",
-    "    \"\"\"Analyze category and subcategory distribution.\"\"\"\n",
+    "def analyze_distribution(data):\n",
     "    category_counts = defaultdict(int)\n",
     "    subcategory_counts = defaultdict(int)\n",
     "    category_subcategory_counts = defaultdict(lambda: defaultdict(int))\n",
@@ -277,42 +257,35 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 68,
+   "execution_count": null,
    "id": "2314e5a5-3f1c-432a-8a14-ccd36c0cbae4",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Category mapping for merging\n",
     "CATEGORY_MAPPING = {\n",
-    "    # Simulacra merging\n",
     "    'Simulacrum Agent': 'Simulacra Agents',\n",
     "    'Simulacra Agent': 'Simulacra Agents',\n",
     "    \n",
-    "    # Outlines merging\n",
     "    'Outlines Agents': 'Outlines Agents',\n",
     "    'Outlines Agent': 'Outlines Agents',\n",
     "    \n",
-    "    # Minecraft merging\n",
     "    'Minecraft Agent': 'Minecraft Agents',\n",
     "    'Voyager MineCraft Agent': 'Minecraft Agents',\n",
     "    \n",
-    "    # Framework merging\n",
     "    'Agent Frameworks': 'Development Frameworks',\n",
     "    'Copilot Frameworks': 'Development Frameworks',\n",
     "    \n",
-    "    # Utility agents merging\n",
     "    'AI Analysis Agent': 'Utility Agents',\n",
     "    'Code Analysis Agent': 'Utility Agents',\n",
     "    'File Management Agent': 'Utility Agents',\n",
     "    'Utility Function': 'Utility Agents',\n",
     "    'WebBrowser Agent': 'Utility Agents',\n",
     "    \n",
-    "    # Data processing merging\n",
     "    'Data Structures': 'Data Processing Agents',\n",
     "    'Data Structure': 'Data Processing Agents',\n",
     "    'Data Compression': 'Data Processing Agents',\n",
     "    \n",
-    "    # Keep major categories as is\n",
     "    'DSPy Agents': 'DSPy Agents',\n",
     "    'LLM Agents': 'LLM Agents',\n",
     "    'Instructor Agents': 'Instructor Agents',\n",
@@ -320,22 +293,17 @@
     "    'LlamaIndex Agents': 'LlamaIndex Agents',\n",
     "    'Langchain Agents': 'Langchain Agents',\n",
     "}\n",
-    "\n",
-    "# Default category for any not explicitly mapped\n",
     "DEFAULT_CATEGORY = 'Other Agents'\n",
     "\n",
     "def balance_agentic_dataset(target_size=25):\n",
-    "    # Load dataset\n",
     "    dataset = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')\n",
     "    \n",
-    "    # Group examples by mapped categories\n",
     "    category_groups = defaultdict(list)\n",
     "    for item in dataset['train']:\n",
     "        original_category = item['category']\n",
     "        mapped_category = CATEGORY_MAPPING.get(original_category, DEFAULT_CATEGORY)\n",
     "        category_groups[mapped_category].append(item)\n",
     "    \n",
-    "    # Print original distribution after mapping\n",
     "    print(\"\\nOriginal distribution after category mapping:\")\n",
     "    for cat, items in category_groups.items():\n",
     "        print(f\"{cat}: {len(items)}\")\n",
@@ -344,7 +312,6 @@
     "    balanced_data = []\n",
     "    for category, items in category_groups.items():\n",
     "        if len(items) > target_size:\n",
-    "            # Randomly sample target_size items\n",
     "            sampled_items = random.sample(items, target_size)\n",
     "            balanced_data.extend(sampled_items)\n",
     "        else:\n",
@@ -364,23 +331,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 70,
+   "execution_count": null,
    "id": "d7ebaf2d-fb69-4257-9879-0be534a9d03f",
    "metadata": {},
    "outputs": [],
    "source": [
     "def balance_agentic_dataset(target_size=25):\n",
-    "    # Load dataset\n",
     "    dataset = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')\n",
-    "    \n",
-    "    # Group examples by mapped categories\n",
     "    category_groups = defaultdict(list)\n",
     "    for item in dataset['train']:\n",
     "        original_category = item['category']\n",
     "        mapped_category = CATEGORY_MAPPING.get(original_category, DEFAULT_CATEGORY)\n",
     "        category_groups[mapped_category].append(item)\n",
-    "    \n",
-    "    # Print original distribution after mapping\n",
+    "        \n",
     "    print(\"\\nOriginal distribution after category mapping:\")\n",
     "    for cat, items in category_groups.items():\n",
     "        print(f\"{cat}: {len(items)}\")\n",
@@ -389,14 +352,11 @@
     "    balanced_data = []\n",
     "    for category, items in category_groups.items():\n",
     "        if len(items) > target_size:\n",
-    "            # Randomly sample target_size items\n",
     "            sampled_items = random.sample(items, target_size)\n",
     "            balanced_data.extend(sampled_items)\n",
     "        else:\n",
-    "            # Keep all items if less than target_size\n",
     "            balanced_data.extend(items)\n",
     "    \n",
-    "    # Print final distribution\n",
     "    final_distribution = defaultdict(int)\n",
     "    for item in balanced_data:\n",
     "        mapped_category = CATEGORY_MAPPING.get(item['category'], DEFAULT_CATEGORY)\n",
@@ -546,21 +506,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 87,
+   "execution_count": null,
    "id": "b879509f-c8ed-42b4-9edd-b3b0056d2196",
    "metadata": {},
    "outputs": [],
    "source": [
     "def balance_func_dataset(target_size=25):\n",
-    "    # Load dataset\n",
     "    dataset = load_dataset('json', data_files='hermes-function-calling-v1/func-calling.json')\n",
     "    \n",
-    "    # Group examples by category\n",
     "    category_groups = defaultdict(list)\n",
     "    for item in dataset['train']:\n",
     "        category_groups[item['category']].append(item)\n",
     "    \n",
-    "    # Print original distribution\n",
     "    print(\"\\nOriginal distribution:\")\n",
     "    for cat, items in category_groups.items():\n",
     "        print(f\"{cat}: {len(items)}\")\n",
@@ -569,14 +526,11 @@
     "    balanced_data = []\n",
     "    for category, items in category_groups.items():\n",
     "        if len(items) > target_size:\n",
-    "            # Randomly sample target_size items\n",
     "            sampled_items = random.sample(items, target_size)\n",
     "            balanced_data.extend(sampled_items)\n",
     "        else:\n",
-    "            # Keep all items if less than target_size\n",
     "            balanced_data.extend(items)\n",
     "    \n",
-    "    # Print final distribution\n",
     "    final_distribution = defaultdict(int)\n",
     "    for item in balanced_data:\n",
     "        final_distribution[item['category']] += 1\n",
@@ -595,7 +549,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 88,
+   "execution_count": null,
    "id": "d1fc2cd0-0cdf-4e04-baa0-b90bae27a76a",
    "metadata": {},
    "outputs": [
@@ -715,20 +669,16 @@
    "source": [
     "def save_as_hf_dataset(balanced_data, output_path='balanced_func_calling'):\n",
     "    \"\"\"Save the balanced dataset as a Hugging Face dataset.\"\"\"\n",
-    "    # Convert to Dataset format\n",
     "    balanced_dataset = Dataset.from_list(balanced_data)\n",
     "    \n",
-    "    # Create DatasetDict with train split\n",
     "    dataset_dict = DatasetDict({\n",
     "        'train': balanced_dataset\n",
     "    })\n",
     "    \n",
-    "    # Save dataset\n",
     "    dataset_dict.save_to_disk(output_path)\n",
     "    print(f\"\\nSaved balanced dataset to {output_path}\")\n",
     "    return dataset_dict\n",
     "\n",
-    "# Run the balancing and save\n",
     "balanced_data = balance_func_dataset(25)\n",
     "dataset_dict = save_as_hf_dataset(balanced_data)"
    ]
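
Note: save_as_hf_dataset round-trips through the standard Hugging Face datasets API; a minimal sketch of that flow:

    from datasets import Dataset, DatasetDict, load_from_disk

    dataset_dict = DatasetDict({'train': Dataset.from_list([{'category': 'LLM Agents'}])})
    dataset_dict.save_to_disk('balanced_func_calling')
    reloaded = load_from_disk('balanced_func_calling')  # DatasetDict with a 'train' split
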
@@ -743,7 +693,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 91,
+   "execution_count": null,
    "id": "a2d8eec4-97df-4abb-b248-e9236a777fd6",
    "metadata": {},
    "outputs": [
@@ -845,34 +795,29 @@
     "import math\n",
     "\n",
     "def downsample_and_tag_dataset(dataset, source_name, target_total=150):\n",
-    "    # Group examples by category\n",
     "    category_groups = defaultdict(list)\n",
     "    for item in dataset['train']:\n",
     "        category_groups[item['category']].append(item)\n",
     "    \n",
     "    num_categories = len(category_groups)\n",
-    "    # Calculate samples per category to achieve target total\n",
     "    samples_per_category = max(1, math.floor(target_total / num_categories))\n",
     "    \n",
     "    print(f\"\\n{source_name}:\")\n",
     "    print(f\"Number of categories: {num_categories}\")\n",
     "    print(f\"Samples per category: {samples_per_category}\")\n",
     "    \n",
-    "    # Balance dataset\n",
     "    balanced_data = []\n",
     "    for category, items in category_groups.items():\n",
     "        if len(items) > samples_per_category:\n",
     "            sampled_items = random.sample(items, samples_per_category)\n",
     "            balanced_data.extend(sampled_items)\n",
     "        else:\n",
-    "            # For categories with fewer examples than target, keep all\n",
     "            balanced_data.extend(items)\n",
     "    \n",
     "    # Add source tag to each example\n",
     "    for item in balanced_data:\n",
     "        item['dataset_source'] = source_name\n",
     "    \n",
-    "    # Print distribution\n",
     "    final_distribution = defaultdict(int)\n",
     "    for item in balanced_data:\n",
     "        final_distribution[item['category']] += 1\n",
@@ -887,34 +832,28 @@
     "    return balanced_data\n",
     "\n",
     "def merge_and_save_datasets(target_per_dataset=150):\n",
-    "    # Load datasets\n",
     "    func_single = load_dataset('json', data_files='hermes-function-calling-v1/func-calling-singleturn.json')\n",
     "    json_single = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-singleturn.json')\n",
     "    \n",
-    "    # Print original sizes\n",
     "    print(f\"Original func_single size: {len(func_single['train'])}\")\n",
     "    print(f\"Original json_single size: {len(json_single['train'])}\")\n",
     "    \n",
-    "    # Downsample and tag each dataset\n",
     "    func_balanced = downsample_and_tag_dataset(func_single, 'func_calling_singleturn', target_per_dataset)\n",
     "    json_balanced = downsample_and_tag_dataset(json_single, 'json_mode_singleturn', target_per_dataset)\n",
     "    \n",
     "    # Merge datasets\n",
     "    merged_data = func_balanced + json_balanced\n",
     "    \n",
-    "    # Create and save merged dataset\n",
     "    merged_dataset = Dataset.from_list(merged_data)\n",
     "    dataset_dict = DatasetDict({\n",
     "        'train': merged_dataset\n",
     "    })\n",
-    "    \n",
-    "    # Print final statistics\n",
+    "\n",
     "    print(\"\\nFinal merged dataset statistics:\")\n",
     "    print(f\"Total examples: {len(merged_data)}\")\n",
     "    print(f\"From func_calling_singleturn: {len(func_balanced)}\")\n",
     "    print(f\"From json_mode_singleturn: {len(json_balanced)}\")\n",
     "    \n",
-    "    # Save dataset\n",
     "    output_path = 'balanced_singleturn_merged'\n",
     "    dataset_dict.save_to_disk(output_path)\n",
     "    print(f\"\\nSaved merged dataset to {output_path}\")\n",

File diff suppressed because it is too large
+ 14 - 224
end-to-end-use-cases/data-tool/Notebooks/Detailed-EDA-ToolACE.ipynb


+ 9 - 28
end-to-end-use-cases/data-tool/Notebooks/Pre-Process-Downsampled-Nous.ipynb

@@ -41,14 +41,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 212,
+   "execution_count": null,
    "id": "eb31fa2a-2cf5-4eaf-bba7-747702103576",
    "metadata": {},
    "outputs": [],
    "source": [
     "from datasets import load_from_disk, DatasetDict\n",
     "import json\n",
-    "import re"
+    "import re\n",
+    "import ast"
    ]
   },
   {
@@ -141,7 +142,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 190,
+   "execution_count": null,
    "id": "6987e0ed-d64a-4451-943e-c16146ebb522",
    "metadata": {},
    "outputs": [
@@ -184,15 +185,9 @@
     "            conv[\"from\"] = \"assistant\"\n",
     "    return example\n",
     "\n",
-    "\n",
-    "# Load dataset\n",
     "glaive_balanced = load_from_disk(\"balanced-json-modeagentic\")\n",
-    "\n",
-    "# Apply preprocessing\n",
     "processed_dataset = glaive_balanced.map(preprocess_conversation)\n",
-    "\n",
-    "# Save dataset\n",
-    "processed_dataset.save_to_disk(\"json-agentic-balanced-final\")\n"
+    "processed_dataset.save_to_disk(\"json-agentic-balanced-final\")"
    ]
   },
   {
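
Note: only the tail of preprocess_conversation survives in this hunk. A hedged reconstruction of the role normalisation it performs, assuming Hermes-style 'conversations' records with 'from'/'value' keys and a 'gpt' source label:

    def preprocess_conversation(example):
        # Assumption: upstream turns label model messages 'gpt'; any other
        # renames in the notebook sit outside this hunk.
        for conv in example["conversations"]:
            if conv["from"] == "gpt":
                conv["from"] = "assistant"
        return example
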
@@ -423,7 +418,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 231,
+   "execution_count": null,
    "id": "08478295-2f80-4e0c-8ecd-5eab64a2b42e",
    "metadata": {},
    "outputs": [],
@@ -441,7 +436,6 @@
     "    return base_prompt + \"\\n\" + tools_json\n",
     "\n",
     "def update_conversations(conv_list, tools_json):\n",
-    "    # Update system prompt\n",
     "    conv_list[0]['value'] = create_new_system_prompt(tools_json)\n",
     "    return conv_list\n",
     "\n",
@@ -528,7 +522,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 235,
+   "execution_count": null,
    "id": "a648abdd-d118-464b-be36-9795319e5095",
    "metadata": {},
    "outputs": [],
@@ -537,15 +531,11 @@
     "   # Remove XML tags and convert JSON-like string to function call format\n",
     "   # From: \"<tool_call>\\n{'name': 'calculate_interest', 'arguments': {'principal': 5000, 'rate': 0.05, 'time': 3}}\\n</tool_call>\"\n",
     "   # To: \"[calculate_interest(principal=5000, rate=0.05, time=3)]\"\n",
-    "   \n",
-    "   import ast\n",
-    "   # Remove XML tags and get the dict\n",
+    "   # Remove XML\n",
     "   if \"<tool_call>\" in json_str:\n",
     "       json_str = json_str.replace(\"<tool_call>\", \"\").replace(\"</tool_call>\", \"\").strip()\n",
-    "   \n",
-    "   # Parse string to dict\n",
+    "       \n",
     "   data = ast.literal_eval(json_str)\n",
-    "   \n",
     "   # Format to function call style\n",
     "   args = [f\"{k}={v}\" for k,v in data['arguments'].items()]\n",
     "   return f\"[{data['name']}({', '.join(args)})]\"\n",
@@ -554,7 +544,6 @@
     "   # Remove XML tags and convert to just the value\n",
     "   # From: \"<tool_response>\\n{'interest': 750}\\n</tool_response>\"\n",
     "   # To: \"750\"\n",
-    "   import ast\n",
     "   if \"<tool_response>\" in json_str:\n",
     "       json_str = json_str.replace(\"<tool_response>\", \"\").replace(\"</tool_response>\", \"\").strip()\n",
     "   \n",
@@ -1444,14 +1433,6 @@
    "source": [
     "!zip -r \"/home/sanyambhutani/final-singleturnfunccall.zip\" \"/home/sanyambhutani/single-turn-func-call-and-agent-final/\""
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "72c8fca9-7083-44d5-b6f0-4f7153df681d",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

File diff suppressed because it is too large
+ 0 - 120
end-to-end-use-cases/data-tool/Notebooks/CoT-ToolAce.ipynb


+ 0 - 23
end-to-end-use-cases/data-tool/Notebooks/Detailed-EDA-XLAM.ipynb

@@ -1,23 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3a988c0b-d7a0-47d2-a0d4-6e35f222ff96",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "",
-   "name": ""
-  },
-  "language_info": {
-   "name": ""
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

File diff suppressed because it is too large
+ 0 - 346
end-to-end-use-cases/data-tool/Notebooks/Pre-Process-Nous.ipynb


+ 2 - 41
end-to-end-use-cases/data-tool/Notebooks/Detailed-EDA-Nous.ipynb

@@ -244,13 +244,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": null,
    "id": "e69878b1-961d-4170-819e-570f0ddc96b8",
    "metadata": {},
    "outputs": [],
    "source": [
     "def format_function_call(tool_call_str):\n",
-    "    # Extract function name and arguments from the old format\n",
     "    try:\n",
     "        import ast\n",
     "        tool_call = ast.literal_eval(tool_call_str.replace('<tool_call>\\n', '').replace('</tool_call>\\n', ''))\n",
@@ -263,7 +262,7 @@
     "        return tool_call_str\n",
     "\n",
     "def clean_assistant_response(response):\n",
-    "   # Remove XML tags and formatting\n",
+    "   # Remove XML\n",
     "   response = response.replace('<tool_call>', '').replace('</tool_call>', '')\n",
     "   response = response.strip('[]').strip()\n",
     "   \n",
@@ -775,36 +774,6 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 56,
-   "id": "4797fbe0-85b2-4b44-8bd8-452d76458ab3",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Sample Conversation Analysis:\n",
-      "\n",
-      "Category: Model APIs\n",
-      "Task: Regression\n",
-      "Available Functions: []\n",
-      "Called Functions: []\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(\"\\nSample Conversation Analysis:\")\n",
-    "sample_idx = np.random.randint(len(df))\n",
-    "sample = df.iloc[sample_idx]\n",
-    "print(f\"\\nCategory: {sample['category']}\")\n",
-    "print(f\"Task: {sample['task']}\")\n",
-    "print(f\"Available Functions: {sample['available_functions']}\")\n",
-    "print(f\"Called Functions: {sample['called_functions']}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
    "execution_count": 57,
    "id": "1a855e95-9c20-4357-b3c0-0a5436a51fb9",
    "metadata": {},
@@ -920,14 +889,6 @@
     "print(\"\\nSummary Statistics by Category:\")\n",
     "print(summary_stats)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "eed363d4-c922-40b8-8ad6-f9e98a851a63",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

end-to-end-use-cases/data-tool/scripts/add_cot.py → end-to-end-use-cases/data-tool/scripts/annotation-inference/add_cot.py


end-to-end-use-cases/data-tool/scripts/add_cot_vllm.py → end-to-end-use-cases/data-tool/scripts/annotation-inference/add_cot_vllm.py