{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Imports & Env Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/justinai/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/pydantic/_internal/_config.py:345: UserWarning: Valid config keys have changed in V2:\n",
      "* 'fields' has been removed\n",
      "  warnings.warn(message, UserWarning)\n"
     ]
    },
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'prompt_migrator'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 10\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdspy\u001b[39;00m\n\u001b[1;32m      9\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mabspath(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[0;32m---> 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mprompt_migrator\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbenchmarks\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m llama_mmlu_pro, leaderboard_mmlu_pro\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'prompt_migrator'"
     ]
    }
   ],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "import sys\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "load_dotenv()\n",
    "\n",
    "import dspy\n",
    "sys.path.append(os.path.abspath('../'))\n",
    "from prompt_migrator.benchmarks import llama_mmlu_pro, leaderboard_mmlu_pro"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/home/justinai/Code/llama-cookbook/end-to-end-use-cases/prompt-migration/v2/notebooks'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%pwd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Run parameters ---\n",
    "NUM_THREADS = 16  # worker threads handed to dspy.Evaluate and the optimizer\n",
    "FEW_SHOTS = 5  # few-shot budget; passed to the optimizer as max_labeled_demos\n",
    "\n",
    "# --- Language models ---\n",
    "# Both roles use one locally served vLLM endpoint through LiteLLM's\n",
    "# `hosted_vllm/` provider, which takes the server URL as `api_base`.\n",
    "# See https://docs.litellm.ai/docs/providers/vllm for details\n",
    "MODEL_NAME = \"hosted_vllm/meta-llama/Llama-3.3-70B-Instruct\"\n",
    "API_BASE = \"http://localhost:8000/v1\"\n",
    "\n",
    "# Keyword arguments shared by both LM handles. The commented entries are\n",
    "# further options that can be enabled when needed.\n",
    "LM_KWARGS = dict(\n",
    "    api_base=API_BASE,\n",
    "    # api_version: Optional[str] = None,\n",
    "    # api_key: Optional[str] = None,\n",
    "    # seed: Optional[int] = None,\n",
    "    # max_tokens: Optional[int] = None,\n",
    "    # timeout: Optional[Union[float, int]] = None,\n",
    ")\n",
    "\n",
    "# Two separate handles so the task model and the prompt-proposal model can be\n",
    "# pointed at different models independently.\n",
    "TASK_MODEL = dspy.LM(MODEL_NAME, **LM_KWARGS)\n",
    "PROMPT_MODEL = dspy.LM(MODEL_NAME, **LM_KWARGS)\n",
    "\n",
    "# Make the task model DSPy's default LM.\n",
    "dspy.configure(lm=TASK_MODEL)\n",
    "\n",
    "# Benchmark under test; `leaderboard_mmlu_pro` is the other option imported above.\n",
    "benchmark = llama_mmlu_pro\n",
    "\n",
    "# Without chain of thought:\n",
    "# program = dspy.Predict(\n",
    "#     benchmark.signature(\"\")\n",
    "# )\n",
    "\n",
    "# With chain of thought:\n",
    "program = dspy.ChainOfThought(\n",
    "    benchmark.signature(\"\")  # put your initial system prompt here, or leave blank\n",
    ")\n",
    "\n",
    "# Shared evaluator. NOTE(review): `devset` is empty here, so a devset has to be\n",
    "# supplied when `evaluate` is called -- confirm at the call sites.\n",
    "evaluate = dspy.Evaluate(\n",
    "    devset=[],\n",
    "    metric=benchmark.metric,\n",
    "    num_threads=NUM_THREADS,\n",
    "    display_progress=True,\n",
    "    display_table=True,\n",
    "    return_all_scores=True,\n",
    "    return_outputs=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the benchmark into train / validation / test sets.\n",
    "# NOTE(review): the sizes look like fractions of the full dataset -- confirm\n",
    "# against `benchmark.datasets`.\n",
    "trainset, valset, testset = benchmark.datasets(train_size=0.1, validation_size=0.2)\n",
    "\n",
    "# Cell output: number of examples in each split.\n",
    "tuple(map(len, (trainset, valset, testset)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optimize Subset + Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 17:44:49 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:\n",
      "num_trials: 7\n",
      "minibatch: False\n",
      "num_candidates: 5\n",
      "valset size: 20\n",
      "\n",
      "2025/01/15 17:44:49 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n",
      "2025/01/15 17:44:49 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.\n",
      "\n",
      "2025/01/15 17:44:49 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=5 sets of demonstrations...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapping set 1/5\n",
      "Bootstrapping set 2/5\n",
      "Bootstrapping set 3/5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 20%|███████████████████████████████████████████████                                                                                                                                                                                            | 4/20 [00:20<01:23,  5.19s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n",
      "Bootstrapping set 4/5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 40%|██████████████████████████████████████████████████████████████████████████████████████████████                                                                                                                                             | 8/20 [00:52<01:18,  6.56s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 4 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.\n",
      "Bootstrapping set 5/5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 20%|███████████████████████████████████████████████                                                                                                                                                                                            | 4/20 [00:21<01:24,  5.29s/it]\n",
      "2025/01/15 17:46:23 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==\n",
      "2025/01/15 17:46:23 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 17:46:47 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "Proposing instructions...\n",
      "\n",
      "2025/01/15 17:47:40 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n",
      "\n",
      "2025/01/15 17:47:40 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `question`, `options`, produce the fields `reasoning`, `answer`.\n",
      "\n",
      "2025/01/15 17:47:40 INFO dspy.teleprompt.mipro_optimizer_v2: 1: To address the task effectively, provide a detailed, step-by-step explanation for your reasoning when answering multiple-choice questions across various subjects, including biology, chemistry, physics, and social sciences. Ensure your response includes the following elements: \n",
      "1. A clear understanding of the question being asked.\n",
      "2. An evaluation of each option based on relevant knowledge and critical thinking.\n",
      "3. A logical deduction of the most appropriate answer.\n",
      "4. A concise summary of your reasoning process.\n",
      "5. The final answer choice selected from the provided options.\n",
      "\n",
      "When constructing your response, consider the complexity and diversity of the questions, and tailor your reasoning to demonstrate a broad range of knowledge and analytical skills. This approach will facilitate the development of a robust and reliable question-answering system capable of handling a wide spectrum of educational and general knowledge queries.\n",
      "\n",
      "2025/01/15 17:47:40 INFO dspy.teleprompt.mipro_optimizer_v2: 2: To answer multiple-choice questions that require reasoning and analysis of the subject matter, follow these steps: \n",
      "\n",
      "1. Read the question carefully and identify the key concepts and information provided.\n",
      "2. Analyze the options and determine which ones are plausible based on the information provided in the question.\n",
      "3. Use a combination of natural language processing and knowledge retrieval to generate a step-by-step reasoning process to arrive at an answer.\n",
      "4. Evaluate the reasoning process and select the most appropriate answer based on the analysis.\n",
      "5. Provide a clear and concise explanation of the reasoning process used to arrive at the answer.\n",
      "\n",
      "Given the fields `question`, `options`, produce the fields `reasoning`, `answer` by following the above steps and using a language model to generate a step-by-step explanation of how to arrive at the answer.\n",
      "\n",
      "2025/01/15 17:47:40 INFO dspy.teleprompt.mipro_optimizer_v2: 3: To address the given task effectively, I propose the following instruction: \n",
      "\n",
      "\"Given a multiple-choice question across various subjects, including but not limited to biology, chemistry, physics, and social sciences, and a list of possible options, generate a detailed, step-by-step reasoning process to arrive at the correct answer. Consider the context, key concepts, and any relevant theories or principles that apply to the question. Ensure the reasoning is clear, logical, and easy to follow, and conclude by selecting the correct answer from the provided options.\n",
      "\n",
      "2025/01/15 17:47:40 INFO dspy.teleprompt.mipro_optimizer_v2: 4: You are a highly skilled expert in a high-stakes testing environment, and your task is to answer a series of challenging multiple-choice questions to determine your suitability for a prestigious position. The questions will cover a wide range of subjects, including biology, chemistry, physics, and social sciences. You must carefully read each question, analyze the options, and provide a step-by-step reasoning process to arrive at the correct answer. The correct answer and your reasoning will be evaluated by a panel of judges, and your performance will determine your eligibility for the position. Given the fields `question`, `options`, produce the fields `reasoning`, `answer` to demonstrate your expertise and critical thinking skills.\n",
      "\n",
      "2025/01/15 17:47:40 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "\n",
      "2025/01/15 17:47:40 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the default program...\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 5.00 / 20 (25.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:24<00:00,  1.24s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 17:48:05 INFO dspy.evaluate.evaluate: Average Metric: 5 / 20 (25.0%)\n",
      "2025/01/15 17:48:05 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 25.0\n",
      "\n",
      "2025/01/15 17:48:05 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n",
      "2025/01/15 17:48:05 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n",
      "\n",
      "2025/01/15 17:48:05 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 1 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 15.00 / 20 (75.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:23<00:00,  1.19s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 17:48:29 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)\n",
      "2025/01/15 17:48:29 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mBest full score so far!\u001b[0m Score: 75.0\n",
      "2025/01/15 17:48:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/15 17:48:29 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 75.0]\n",
      "2025/01/15 17:48:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0\n",
      "2025/01/15 17:48:29 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/15 17:48:29 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 16.00 / 20 (80.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:22<00:00,  1.13s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 17:48:52 INFO dspy.evaluate.evaluate: Average Metric: 16 / 20 (80.0%)\n",
      "2025/01/15 17:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mBest full score so far!\u001b[0m Score: 80.0\n",
      "2025/01/15 17:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/15 17:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 75.0, 80.0]\n",
      "2025/01/15 17:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/15 17:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/15 17:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 15.00 / 20 (75.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:22<00:00,  1.14s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 17:49:15 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)\n",
      "2025/01/15 17:49:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/15 17:49:15 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 75.0, 80.0, 75.0]\n",
      "2025/01/15 17:49:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/15 17:49:15 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/15 17:49:15 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 16.00 / 20 (80.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 1428.33it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 17:49:15 INFO dspy.evaluate.evaluate: Average Metric: 16 / 20 (80.0%)\n",
      "2025/01/15 17:49:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/15 17:49:15 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 75.0, 80.0, 75.0, 80.0]\n",
      "2025/01/15 17:49:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/15 17:49:15 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/15 17:49:15 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 14.00 / 20 (70.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:25<00:00,  1.30s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 17:49:41 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)\n",
      "2025/01/15 17:49:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3'].\n",
      "2025/01/15 17:49:41 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 75.0, 80.0, 75.0, 80.0, 70.0]\n",
      "2025/01/15 17:49:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/15 17:49:41 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/15 17:49:41 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 15.00 / 20 (75.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:24<00:00,  1.25s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 17:50:06 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)\n",
      "2025/01/15 17:50:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/15 17:50:06 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 75.0, 80.0, 75.0, 80.0, 70.0, 75.0]\n",
      "2025/01/15 17:50:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/15 17:50:06 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/15 17:50:06 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 15.00 / 20 (75.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:26<00:00,  1.34s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 17:50:33 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)\n",
      "2025/01/15 17:50:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 4'].\n",
      "2025/01/15 17:50:33 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [25.0, 75.0, 80.0, 75.0, 80.0, 70.0, 75.0, 75.0]\n",
      "2025/01/15 17:50:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/15 17:50:33 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/15 17:50:33 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 80.0!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Number of leading examples taken from each split for this quick run.\n",
    "subset_size = 20\n",
    "\n",
    "# MIPROv2 on its \"light\" auto preset, scored with the benchmark's own metric.\n",
    "optimizer = dspy.MIPROv2(\n",
    "    metric=benchmark.metric,\n",
    "    task_model=TASK_MODEL,\n",
    "    prompt_model=PROMPT_MODEL,\n",
    "    max_labeled_demos=FEW_SHOTS,\n",
    "    num_threads=NUM_THREADS,\n",
    "    auto=\"light\",\n",
    ")\n",
    "\n",
    "# Optimize against the first `subset_size` train / validation examples.\n",
    "# requires_permission_to_run=False avoids the interactive confirmation prompt.\n",
    "optimized_program = optimizer.compile(\n",
    "    program,\n",
    "    requires_permission_to_run=False,\n",
    "    trainset=trainset[:subset_size],\n",
    "    valset=valset[:subset_size],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BEST PROMPT:\n",
      " To answer multiple-choice questions that require reasoning and analysis of the subject matter, follow these steps: \n",
      "\n",
      "1. Read the question carefully and identify the key concepts and information provided.\n",
      "2. Analyze the options and determine which ones are plausible based on the information provided in the question.\n",
      "3. Use a combination of natural language processing and knowledge retrieval to generate a step-by-step reasoning process to arrive at an answer.\n",
      "4. Evaluate the reasoning process and select the most appropriate answer based on the analysis.\n",
      "5. Provide a clear and concise explanation of the reasoning process used to arrive at the answer.\n",
      "\n",
      "Given the fields `question`, `options`, produce the fields `reasoning`, `answer` by following the above steps and using a language model to generate a step-by-step explanation of how to arrive at the answer.\n"
     ]
    }
   ],
   "source": [
    "# Show the instruction text carried by the optimized program's predictor.\n",
    "best_instructions = optimized_program.predict.signature.instructions\n",
    "print(\"BEST PROMPT:\\n\", best_instructions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 13.00 / 20 (65.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:26<00:00,  1.32s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 17:52:03 INFO dspy.evaluate.evaluate: Average Metric: 13 / 20 (65.0%)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>options</th>\n",
       "      <th>example_answer</th>\n",
       "      <th>reasoning</th>\n",
       "      <th>pred_answer</th>\n",
       "      <th>metric</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A certain map uses a scale of 1 inch equals 25 miles. How many mil...</td>\n",
       "      <td>[A. 50, B. 150, C. 100, D. 25, E. 5, F. 200, G. 75, H. 125, I. 175...</td>\n",
       "      <td>H</td>\n",
       "      <td>To find the number of miles represented by 5 inches on the map, we...</td>\n",
       "      <td>B</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>When a ball at rest hangs by a single vertical string tension in t...</td>\n",
       "      <td>['A. may be equal to mg depending on the speed of the ball', 'B. i...</td>\n",
       "      <td>D</td>\n",
       "      <td>To solve this problem, we need to consider the forces acting on th...</td>\n",
       "      <td>D</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A 125-kW, 250-V, 1800-rev/min, cumulative compound d-c generator h...</td>\n",
       "      <td>['A. 90%, 200 hp', 'B. 87.5%, 195 hp', 'C. 84.7%, 175 hp', 'D. 88....</td>\n",
       "      <td>D</td>\n",
       "      <td>To find the efficiency and input horsepower requirements of the cu...</td>\n",
       "      <td>E</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>The diameter of the objective lens of a telescope is 30 mm and the...</td>\n",
       "      <td>[A. 80%, B. 70%, C. 100%, D. 30%, E. 40%, F. 90%, G. 25%, H. 60%, ...</td>\n",
       "      <td>B</td>\n",
       "      <td>To determine the fraction of the area of the objective lens that i...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The temperature at the point $(x, y, z)$ in a substance with condu...</td>\n",
       "      <td>[A. $800\\pi$, B. $1504\\pi$, C. $1248\\pi$, D. $1560\\pi$, E. $960\\pi...</td>\n",
       "      <td>C</td>\n",
       "      <td>To find the rate of heat flow inward across the cylindrical surfac...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Which of the following must be done when universal screening data ...</td>\n",
       "      <td>['A. The school must invest in more modern educational technology....</td>\n",
       "      <td>G</td>\n",
       "      <td>When universal screening data show that very few students are succ...</td>\n",
       "      <td>G</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Sulfurylchloride decomposes according to the equation, SO_2CI_2(g)...</td>\n",
       "      <td>['A. 1.49 × 10^-2, .0386', 'B. 7.45 × 10^-3, .0542', 'C. 2.98 × 10...</td>\n",
       "      <td>A</td>\n",
       "      <td>To find the equilibrium constant K_p, we can use the equation: K_p...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>A number’s prime factors are 2, 5, 7, 13, and 31. Which of the fol...</td>\n",
       "      <td>[A. 10, B. 25, C. 6, D. 8, E. 15, F. 30, G. 20, H. 4]</td>\n",
       "      <td>A</td>\n",
       "      <td>To find a factor that must be a factor of the number, we need to l...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>How many ribs are there in the human body?</td>\n",
       "      <td>[A. 42, B. 24, C. 18, D. 36, E. 40, F. 28, G. 22, H. 30, I. 20, J....</td>\n",
       "      <td>B</td>\n",
       "      <td>The human body typically has 12 pairs of ribs, which are attached ...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>To what extent have biological agents been weaponized?</td>\n",
       "      <td>['A. Military professionals believe that the majority of biologica...</td>\n",
       "      <td>C</td>\n",
       "      <td>The question asks about the extent to which biological agents have...</td>\n",
       "      <td>A</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>A swimming pool is circular with a $40-\\mathrm{ft}$ diameter. The ...</td>\n",
       "      <td>['A. $1800\\\\pi$ $\\\\mathrm{ft}^3$', 'B. $1400\\\\pi$ $\\\\mathrm{ft}^3$...</td>\n",
       "      <td>A</td>\n",
       "      <td>To find the volume of water in the pool, we first need to understa...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Determine the heat of vaporization ofacetonitrilewith a normal boi...</td>\n",
       "      <td>['A. 7.8 kcal mol^-1', 'B. 6.8 kcal mol^-1', 'C. 7.0 kcal mol^-1',...</td>\n",
       "      <td>E</td>\n",
       "      <td>To determine the heat of vaporization of acetonitrile, we can use ...</td>\n",
       "      <td>A</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Feinberg claims that the best way to pursue happiness is to:</td>\n",
       "      <td>['A. strive for success.', 'B. pursue knowledge.', 'C. pursue happ...</td>\n",
       "      <td>G</td>\n",
       "      <td>Feinberg's claim is related to the concept of happiness and how to...</td>\n",
       "      <td>G</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Consider a profit-maximizing firm in a perfectly competitive marke...</td>\n",
       "      <td>[\"A. lead the firm to reduce workers' wages as they now contribute...</td>\n",
       "      <td>B</td>\n",
       "      <td>In a perfectly competitive market, firms are price takers, meaning...</td>\n",
       "      <td>B is incorrect because while the firm will hire more workers, the ...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Which Confucian philosopher is described as a mystic due to his f...</td>\n",
       "      <td>[A. Zhuangzi, B. Dao, C. Xunzi, D. Laozi, E. Zisi, F. Wang Yangmin...</td>\n",
       "      <td>G</td>\n",
       "      <td>The question asks for a Confucian philosopher who is described as ...</td>\n",
       "      <td>A</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>What children's TV character is known as 'Da Niao' in China?</td>\n",
       "      <td>['A. Barney', 'B. Tinky Winky', 'C. Mickey Mouse', 'D. Winnie the ...</td>\n",
       "      <td>E</td>\n",
       "      <td>The question asks for a children's TV character known as 'Da Niao'...</td>\n",
       "      <td>E</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>A buyer filed a lawsuit against a seller based on a written contra...</td>\n",
       "      <td>['A. admissible, because the original contract was lost.', 'B. ina...</td>\n",
       "      <td>J</td>\n",
       "      <td>The best evidence rule generally requires that the original docume...</td>\n",
       "      <td>A</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Mr. Williams owns a piece of property assessed at $7,800 in a city...</td>\n",
       "      <td>[A. $218.40, B. $220.00, C. $234.00, D. $210.00, E. $225.60, F. $2...</td>\n",
       "      <td>A</td>\n",
       "      <td>To find the amount of property taxes Mr. Williams pays, we need to...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>An externality</td>\n",
       "      <td>['A. results in a deficit of goods in the market', 'B. results in ...</td>\n",
       "      <td>G</td>\n",
       "      <td>An externality refers to a situation where the production or consu...</td>\n",
       "      <td>G</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>If GNP = $2,000 billion and the velocity of money is 4, what isthe...</td>\n",
       "      <td>['A. $300 billion', 'B. $250 billion', 'C. $700 billion', 'D. $100...</td>\n",
       "      <td>J</td>\n",
       "      <td>To find the money supply, we can use the equation: M * V = GNP, wh...</td>\n",
       "      <td>J</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                 question  \\\n",
       "0   A certain map uses a scale of 1 inch equals 25 miles. How many mil...   \n",
       "1   When a ball at rest hangs by a single vertical string tension in t...   \n",
       "2   A 125-kW, 250-V, 1800-rev/min, cumulative compound d-c generator h...   \n",
       "3   The diameter of the objective lens of a telescope is 30 mm and the...   \n",
       "4   The temperature at the point $(x, y, z)$ in a substance with condu...   \n",
       "5   Which of the following must be done when universal screening data ...   \n",
       "6   Sulfurylchloride decomposes according to the equation, SO_2CI_2(g)...   \n",
       "7   A number’s prime factors are 2, 5, 7, 13, and 31. Which of the fol...   \n",
       "8                              How many ribs are there in the human body?   \n",
       "9                  To what extent have biological agents been weaponized?   \n",
       "10  A swimming pool is circular with a $40-\\mathrm{ft}$ diameter. The ...   \n",
       "11  Determine the heat of vaporization ofacetonitrilewith a normal boi...   \n",
       "12           Feinberg claims that the best way to pursue happiness is to:   \n",
       "13  Consider a profit-maximizing firm in a perfectly competitive marke...   \n",
       "14   Which Confucian philosopher is described as a mystic due to his f...   \n",
       "15           What children's TV character is known as 'Da Niao' in China?   \n",
       "16  A buyer filed a lawsuit against a seller based on a written contra...   \n",
       "17  Mr. Williams owns a piece of property assessed at $7,800 in a city...   \n",
       "18                                                         An externality   \n",
       "19  If GNP = $2,000 billion and the velocity of money is 4, what isthe...   \n",
       "\n",
       "                                                                  options  \\\n",
       "0   [A. 50, B. 150, C. 100, D. 25, E. 5, F. 200, G. 75, H. 125, I. 175...   \n",
       "1   ['A. may be equal to mg depending on the speed of the ball', 'B. i...   \n",
       "2   ['A. 90%, 200 hp', 'B. 87.5%, 195 hp', 'C. 84.7%, 175 hp', 'D. 88....   \n",
       "3   [A. 80%, B. 70%, C. 100%, D. 30%, E. 40%, F. 90%, G. 25%, H. 60%, ...   \n",
       "4   [A. $800\\pi$, B. $1504\\pi$, C. $1248\\pi$, D. $1560\\pi$, E. $960\\pi...   \n",
       "5   ['A. The school must invest in more modern educational technology....   \n",
       "6   ['A. 1.49 × 10^-2, .0386', 'B. 7.45 × 10^-3, .0542', 'C. 2.98 × 10...   \n",
       "7                   [A. 10, B. 25, C. 6, D. 8, E. 15, F. 30, G. 20, H. 4]   \n",
       "8   [A. 42, B. 24, C. 18, D. 36, E. 40, F. 28, G. 22, H. 30, I. 20, J....   \n",
       "9   ['A. Military professionals believe that the majority of biologica...   \n",
       "10  ['A. $1800\\\\pi$ $\\\\mathrm{ft}^3$', 'B. $1400\\\\pi$ $\\\\mathrm{ft}^3$...   \n",
       "11  ['A. 7.8 kcal mol^-1', 'B. 6.8 kcal mol^-1', 'C. 7.0 kcal mol^-1',...   \n",
       "12  ['A. strive for success.', 'B. pursue knowledge.', 'C. pursue happ...   \n",
       "13  [\"A. lead the firm to reduce workers' wages as they now contribute...   \n",
       "14  [A. Zhuangzi, B. Dao, C. Xunzi, D. Laozi, E. Zisi, F. Wang Yangmin...   \n",
       "15  ['A. Barney', 'B. Tinky Winky', 'C. Mickey Mouse', 'D. Winnie the ...   \n",
       "16  ['A. admissible, because the original contract was lost.', 'B. ina...   \n",
       "17  [A. $218.40, B. $220.00, C. $234.00, D. $210.00, E. $225.60, F. $2...   \n",
       "18  ['A. results in a deficit of goods in the market', 'B. results in ...   \n",
       "19  ['A. $300 billion', 'B. $250 billion', 'C. $700 billion', 'D. $100...   \n",
       "\n",
       "   example_answer  \\\n",
       "0               H   \n",
       "1               D   \n",
       "2               D   \n",
       "3               B   \n",
       "4               C   \n",
       "5               G   \n",
       "6               A   \n",
       "7               A   \n",
       "8               B   \n",
       "9               C   \n",
       "10              A   \n",
       "11              E   \n",
       "12              G   \n",
       "13              B   \n",
       "14              G   \n",
       "15              E   \n",
       "16              J   \n",
       "17              A   \n",
       "18              G   \n",
       "19              J   \n",
       "\n",
       "                                                                reasoning  \\\n",
       "0   To find the number of miles represented by 5 inches on the map, we...   \n",
       "1   To solve this problem, we need to consider the forces acting on th...   \n",
       "2   To find the efficiency and input horsepower requirements of the cu...   \n",
       "3   To determine the fraction of the area of the objective lens that i...   \n",
       "4   To find the rate of heat flow inward across the cylindrical surfac...   \n",
       "5   When universal screening data show that very few students are succ...   \n",
       "6   To find the equilibrium constant K_p, we can use the equation: K_p...   \n",
       "7   To find a factor that must be a factor of the number, we need to l...   \n",
       "8   The human body typically has 12 pairs of ribs, which are attached ...   \n",
       "9   The question asks about the extent to which biological agents have...   \n",
       "10  To find the volume of water in the pool, we first need to understa...   \n",
       "11  To determine the heat of vaporization of acetonitrile, we can use ...   \n",
       "12  Feinberg's claim is related to the concept of happiness and how to...   \n",
       "13  In a perfectly competitive market, firms are price takers, meaning...   \n",
       "14  The question asks for a Confucian philosopher who is described as ...   \n",
       "15  The question asks for a children's TV character known as 'Da Niao'...   \n",
       "16  The best evidence rule generally requires that the original docume...   \n",
       "17  To find the amount of property taxes Mr. Williams pays, we need to...   \n",
       "18  An externality refers to a situation where the production or consu...   \n",
       "19  To find the money supply, we can use the equation: M * V = GNP, wh...   \n",
       "\n",
       "                                                              pred_answer  \\\n",
       "0                                                                       B   \n",
       "1                                                                       D   \n",
       "2                                                                       E   \n",
       "3                                                                       B   \n",
       "4                                                                       C   \n",
       "5                                                                       G   \n",
       "6                                                                       A   \n",
       "7                                                                       A   \n",
       "8                                                                       B   \n",
       "9                                                                       A   \n",
       "10                                                                      A   \n",
       "11                                                                      A   \n",
       "12                                                                      G   \n",
       "13  B is incorrect because while the firm will hire more workers, the ...   \n",
       "14                                                                      A   \n",
       "15                                                                      E   \n",
       "16                                                                      A   \n",
       "17                                                                      A   \n",
       "18                                                                      G   \n",
       "19                                                                      J   \n",
       "\n",
       "       metric  \n",
       "0              \n",
       "1   ✔️ [True]  \n",
       "2              \n",
       "3   ✔️ [True]  \n",
       "4   ✔️ [True]  \n",
       "5   ✔️ [True]  \n",
       "6   ✔️ [True]  \n",
       "7   ✔️ [True]  \n",
       "8   ✔️ [True]  \n",
       "9              \n",
       "10  ✔️ [True]  \n",
       "11             \n",
       "12  ✔️ [True]  \n",
       "13             \n",
       "14             \n",
       "15  ✔️ [True]  \n",
       "16             \n",
       "17  ✔️ [True]  \n",
       "18  ✔️ [True]  \n",
       "19  ✔️ [True]  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "score, results, all_scores = evaluate(\n",
    "    optimized_program,\n",
    "    devset=testset[:subset_size],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Medium Optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 18:09:23 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:\n",
      "num_trials: 25\n",
      "minibatch: True\n",
      "num_candidates: 19\n",
      "valset size: 300\n",
      "\n",
      "2025/01/15 18:09:23 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n",
      "2025/01/15 18:09:23 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.\n",
      "\n",
      "2025/01/15 18:09:23 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=19 sets of demonstrations...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapping set 1/19\n",
      "Bootstrapping set 2/19\n",
      "Bootstrapping set 3/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▏                                                                                                                                                      | 4/500 [00:19<40:37,  4.91s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n",
      "Bootstrapping set 4/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▏                                                                                                                                                      | 4/500 [00:17<36:04,  4.36s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n",
      "Bootstrapping set 5/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|▉                                                                                                                                                     | 3/500 [00:32<1:28:42, 10.71s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 6/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▏                                                                                                                                                      | 4/500 [00:17<36:47,  4.45s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n",
      "Bootstrapping set 7/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|▉                                                                                                                                                       | 3/500 [00:10<28:15,  3.41s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 8/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▎                                                                                                                                                       | 1/500 [00:03<26:50,  3.23s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n",
      "Bootstrapping set 9/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▌                                                                                                                                                     | 2/500 [00:20<1:26:27, 10.42s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
      "Bootstrapping set 10/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▌                                                                                                                                                    | 5/500 [01:04<1:47:03, 12.98s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.\n",
      "Bootstrapping set 11/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|▉                                                                                                                                                     | 3/500 [00:21<1:00:43,  7.33s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 12/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▊                                                                                                                                                    | 6/500 [01:01<1:23:58, 10.20s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.\n",
      "Bootstrapping set 13/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▊                                                                                                                                                      | 6/500 [00:37<51:56,  6.31s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.\n",
      "Bootstrapping set 14/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|▉                                                                                                                                                       | 3/500 [00:14<40:19,  4.87s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 15/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▌                                                                                                                                                      | 5/500 [00:30<50:42,  6.15s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.\n",
      "Bootstrapping set 16/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▌                                                                                                                                                      | 5/500 [00:26<44:25,  5.39s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.\n",
      "Bootstrapping set 17/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|▉                                                                                                                                                       | 3/500 [00:09<25:43,  3.11s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 18/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▎                                                                                                                                                       | 1/500 [00:05<49:46,  5.98s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n",
      "Bootstrapping set 19/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  2%|██▍                                                                                                                                                     | 8/500 [00:52<53:34,  6.53s/it]\n",
      "2025/01/15 18:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==\n",
      "2025/01/15 18:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 18:34:39 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "Proposing instructions...\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `question`, `options`, produce the fields `reasoning`, `answer`.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 1: To answer a multiple-choice question, provide a step-by-step thought process explaining how you arrived at your answer. Given a question and a list of possible options, generate a clear and logical reasoning process and select the most appropriate answer from the provided options.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 2: You are a knowledgeable educator specializing in science and critical thinking. Given the fields `question` and `options`, produce detailed step-by-step `reasoning` and select the most appropriate `answer` from the provided options, ensuring that your reasoning is clear, logical, and based on relevant scientific principles or knowledge.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 3: To address the task effectively, provide a detailed and step-by-step analysis of the given question, considering all available options. Begin by carefully reading and understanding the question, identifying key concepts and any specific details that could influence the answer. Next, evaluate each option in the context of the question, applying relevant knowledge and principles to determine its validity. The reasoning process should be transparent, logical, and well-structured, allowing for the elimination of incorrect options and the identification of the correct answer. Ensure that the reasoning is grounded in established theories, principles, or empirical evidence relevant to the subject matter of the question. Finally, select the answer that best aligns with the conclusions drawn from the analysis, and present it in a clear and concise manner. This approach will facilitate the production of high-quality responses that demonstrate a thorough understanding of the subject matter and the ability to apply critical thinking skills to resolve complex questions.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 4: To tackle this physics problem, let's break it down into manageable steps. Given a question about calculating the total charge enclosed by a volume defined by a spherical region with a specific electric displacement, and a list of possible answers, generate a detailed, step-by-step reasoning process. This should include applying relevant physical laws, such as Gauss's law, and performing the necessary mathematical calculations to determine the total charge. Ensure the reasoning is clear, concise, and easy to follow, and that it leads to the selection of the correct answer from the provided options. The goal is to demonstrate a thorough understanding of the underlying physics principles and the ability to apply them to solve complex problems.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 5: To address the task effectively, provide a detailed, step-by-step explanation for the reasoning behind the answer to the given question, considering all provided options. Ensure the reasoning is clear, logical, and directly addresses the question, utilizing relevant concepts or formulas as necessary. Finally, select the correct answer from the options based on the reasoning provided.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 6: To generate a comprehensive response, carefully analyze the provided question and options. Begin by understanding the context and key components of the question, identifying any specific subjects, concepts, or themes it touches upon. Then, evaluate each option in light of this understanding, considering the relevance, accuracy, and completeness of the information provided in each choice. The reasoning process should involve weighing the pros and cons of each option, looking for the most accurate and comprehensive answer that aligns with the question's requirements. Ensure that the reasoning is step-by-step, clearly explaining the thought process behind eliminating incorrect options and selecting the correct answer. Finally, based on this analysis, provide the answer that best fits the question, accompanied by a detailed explanation of the reasoning process used to arrive at this conclusion. This approach will facilitate the production of high-quality, informative responses that not only provide the correct answer but also offer insight into the decision-making process.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 7: You are a high-stakes test administrator responsible for evaluating the knowledge and critical thinking skills of students in a prestigious academic competition. The competition's grand prize is a full scholarship to a top-tier university, and the stakes are extremely high. Given the fields `question` and `options`, you must produce the fields `reasoning` and `answer` to demonstrate your ability to think critically and solve complex problems under pressure. The question requires you to apply your knowledge of various subjects, including physics, chemistry, biology, and social sciences, to arrive at a correct solution. You must provide a step-by-step reasoning process, as if you were explaining your thought process to a colleague, and ultimately select the correct answer from the provided options. The fate of the scholarship hangs in the balance, and your response will be scrutinized by a panel of expert judges.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 8: To tackle this task effectively, provide a detailed, step-by-step explanation for solving the given problem, incorporating relevant scientific principles and formulas. Begin by analyzing the question to identify key elements such as the initial volume of the oil, the pressure it is subjected to, and its compressibility. Then, apply the formula for compressibility to calculate the change in volume, ensuring to convert units as necessary to maintain consistency. Finally, select the answer from the provided options that best matches the calculated decrease in volume, rounding appropriately. The goal is to not only arrive at the correct answer but to demonstrate a clear understanding of the scientific concepts involved through the reasoning process.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 9: You are a skilled engineer and problem solver. Given the fields `question`, `options`, produce the fields `reasoning`, `answer` by carefully analyzing the problem, applying relevant formulas and principles, and selecting the most appropriate answer based on your calculations and logical deductions. Ensure your reasoning is step-by-step and clearly explains how you arrived at your answer.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 10: To solve this task, provide a detailed, step-by-step explanation for the given physics or chemistry problem, utilizing relevant formulas and equations. Ensure the reasoning is clear and easy to follow, leading to the selection of the correct answer from the provided options. The explanation should be based on the principles of physics or chemistry applicable to the problem, demonstrating a thorough understanding of the subject matter. Finally, choose the correct answer based on the reasoning provided.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 11: To tackle this task effectively, you should carefully read and analyze the provided question, considering the subject matter and the specific details given. Then, evaluate each option by applying relevant concepts, formulas, or logical reasoning to determine the most appropriate answer. Your response should include a step-by-step reasoning process that clearly explains how you arrived at your conclusion, making it easy for someone to follow your thought process and understand why a particular answer is correct. Ensure that your reasoning is detailed, accurate, and directly addresses the question asked, providing calculations, explanations, or logical deductions as necessary. Finally, select the answer choice that best aligns with your reasoning and provide it as the final answer.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 12: To generate a well-structured response for the given problem, please follow these steps: \n",
      "\n",
      "1. Read the question carefully and identify the key elements required to solve it, such as formulas, known values, or specific concepts from physics, chemistry, biology, or social sciences.\n",
      "\n",
      "2. Consider the options provided and think about how each option relates to the question. Eliminate any options that are clearly incorrect based on your initial understanding.\n",
      "\n",
      "3. Develop a step-by-step reasoning process. For physics and chemistry problems, this might involve applying specific formulas or principles. For example, if the question involves dipole moments, use the formula μ = qd, where μ is the dipole moment, q is the magnitude of the partial charge, and d is the distance between the centers of the positive and negative charge distributions. If the question is about atmospheric pressure and mass, use the relationship P = mg/A, where P is pressure, m is mass, g is acceleration due to gravity, and A is surface area.\n",
      "\n",
      "4. For problems that require critical thinking and application of concepts to scenarios, break down the scenario into manageable parts. Identify the key factors that influence the outcome and apply relevant principles or formulas to each part.\n",
      "\n",
      "5. Once you have developed your reasoning, select the answer choice that best aligns with your conclusion. Ensure that your reasoning is sound and that you have considered all relevant information provided in the question.\n",
      "\n",
      "6. Finally, present your answer along with a clear and concise explanation of your reasoning. This should include any calculations, assumptions, or logical deductions made during the problem-solving process.\n",
      "\n",
      "By following these steps and providing detailed reasoning for your answer, you will be able to generate comprehensive and informative responses to the questions.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 13: You are a critical care specialist in a high-stakes emergency room, and you must make life-or-death decisions quickly and accurately. You have been presented with a complex medical question that requires careful analysis and reasoning to arrive at the correct answer. The question is: [[question]]. You have the following options to choose from: [[options]]. To make an informed decision, you must think step by step, considering all relevant information and weighing the pros and cons of each option. Your task is to produce a clear and logical reasoning process and select the correct answer from the options provided. The lives of your patients depend on your ability to think critically and make sound judgments. Please provide your reasoning and answer.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 14: To tackle a wide range of questions across various subjects, including physics, chemistry, biology, and social sciences, follow this step-by-step approach. Given a question and a list of possible options, your task is to generate a detailed reasoning process and select the most appropriate answer. Begin by carefully reading and understanding the question, identifying key concepts and any specific information provided. Next, evaluate each option in relation to the question, considering the principles and theories relevant to the subject matter. For questions that require calculations, such as those in physics or chemistry, apply the relevant formulas and perform the necessary computations step by step. In cases where conceptual understanding is key, such as in psychology or biology, think about how the concepts relate to the scenario described in the question. Ensure that your reasoning is logical, well-structured, and clearly explains how you arrive at your conclusion. Finally, based on your analysis, choose the answer that best aligns with your reasoning. The goal is not only to select the correct answer but also to demonstrate a thorough understanding of the subject matter by providing a clear and thoughtful reasoning process.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 15: Given a question and a list of possible options, provide a step-by-step reasoning process to arrive at the correct answer, utilizing relevant concepts and formulas from various subjects such as physics, mathematics, and history. Ensure the reasoning is clear, concise, and well-structured, leading to a definitive answer choice.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 16: To generate a well-structured response, please carefully analyze the provided question and options. Begin by reading the question thoroughly and identifying the key concepts or information required to answer it. Then, examine each option in relation to the question, considering how each might be relevant or irrelevant to the correct answer. Next, develop a step-by-step reasoning process that evaluates the options against the question's requirements, using logical deductions and any relevant knowledge or principles that apply. Ensure that your reasoning is clear, concise, and directly addresses how you arrive at the chosen answer from the options provided. Finally, select the answer that best aligns with your reasoning and present it along with the reasoning process in a coherent and understandable format.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 17: Analyze the given question and options related to legal cases, particularly those involving the Second Amendment and gun control regulations. Using relevant legal precedents such as the Heller and McDonald cases, generate a step-by-step reasoning process to evaluate the constitutionality of the statute in question. Based on this analysis, select the most appropriate answer from the provided options, ensuring that the reasoning is well-supported by legal frameworks and precedents. The goal is to mimic the decision-making process of the U.S. Supreme Court in similar cases, considering the balance between individual rights and public safety regulations.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: 18: Given a question and a list of possible options, generate a step-by-step reasoning process to evaluate the options and arrive at a correct answer. The reasoning should be clear, concise, and mimic human thought, considering the context and evidence provided by the question and options. Ultimately, the goal is to select the most appropriate answer based on the reasoning provided.\n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "\n",
      "2025/01/15 19:22:49 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the default program...\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 16.00 / 113 (14.2%):  38%|██████████████████████████████████████████▌                                                                      | 113/300 [01:49<02:55,  1.07it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:24:38 ERROR dspy.utils.parallelizer: Error processing item Example({'question': \"Suppose a nanostructure is modelled by an electron confined to a rectangular region with sides of lengths $L_1=1.0 \\\\mathrm{~nm}$ and $L_2=2.0 \\\\mathrm{~nm}$ and is subjected to thermal motion with a typical energy equal to $k T$, where $k$ is Boltzmann's constant. How low should the temperature be for the thermal energy to be comparable to the zero-point energy？\", 'options': ['A. 2.0 $10^2 \\\\mathrm{~K}$', 'B.  5.5 $10^3 \\\\mathrm{~K}$', 'C. 2.5 $10^3 \\\\mathrm{~K}$', 'D. 7.0 $10^3 \\\\mathrm{~K}$', 'E. 1.2 $10^4 \\\\mathrm{~K}$', 'F. 6.5 $10^3 \\\\mathrm{~K}$', 'G. 9.5 $10^3 \\\\mathrm{~K}$', 'H. 1.5 $10^4 \\\\mathrm{~K}$', 'I. 8.0 $10^3 \\\\mathrm{~K}$', 'J. 3.0 $10^3 \\\\mathrm{~K}$'], 'answer': 'B'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 21.00 / 128 (16.4%):  43%|████████████████████████████████████████████████▌                                                                | 129/300 [02:05<03:05,  1.08s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:24:54 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A tank contains 100 gal of water and $50 \\\\mathrm{oz}$ of salt. Water containing a salt concentration of $\\\\frac{1}{4}\\\\left(1+\\\\frac{1}{2} \\\\sin t\\\\right) \\\\mathrm{oz} / \\\\mathrm{gal}$ flows into the tank at a rate of $2 \\\\mathrm{gal} / \\\\mathrm{min}$, and the mixture in the tank flows out at the same rate.\\nThe long-time behavior of the solution is an oscillation about a certain constant level. What is the amplitude of the oscillation?', 'options': ['A. 0.14995', 'B.  0.24995', 'C. 0.34995', 'D. 0.29995', 'E. 0.50000', 'F. 0.44995', 'G. 0.39995', 'H. 0.19995', 'I. 0.59995', 'J. 0.10000'], 'answer': 'B'}) (input_keys={'options', 'question'}): 'list' object has no attribute 'items'. Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 22.00 / 139 (15.8%):  47%|█████████████████████████████████████████████████████                                                            | 141/300 [02:13<02:07,  1.25it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:25:03 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'The isomer of C_2H_5OH(l) is CH_3 - O - CH_3(g) . Given that, at 25°C, the heat of formation of C_2H_5OH(l) = - 66 kcal/mole, the heat of combustion of the isomeric CH_3 - O - CH_3(g) to CO_2(g) and H_2O(l) = - 348 kcal/mole, the heat of formation of H_2O(l) = - 68 kcal/mole and the heat of combustion of carbon to CO_2(g) = - 94 kcal/mole. Calculate a) \\\\DeltaH_298°K for the reaction and b) \\\\DeltaE_298°K for the reaction assuming \\\\DeltaH_298°K in part a) = - 10 kcal.', 'options': ['A. \\\\DeltaH = 22 kcal, \\\\DeltaE = - 10.6 kcal', 'B. \\\\DeltaH = 204 kcal, \\\\DeltaE = - 204 kcal', 'C. \\\\DeltaH = 204 kcal, \\\\DeltaE = 204 kcal', 'D. \\\\DeltaH = 348 kcal, \\\\DeltaE = - 188 kcal', 'E. \\\\DeltaH = 66 kcal, \\\\DeltaE = - 10 kcal', 'F. \\\\DeltaH = -66 kcal, \\\\DeltaE = -10 kcal', 'G. \\\\DeltaH = 66 kcal, \\\\DeltaE = 10 kcal', 'H. \\\\DeltaH = -22 kcal, \\\\DeltaE = 10.6 kcal', 'I. \\\\DeltaH = -348 kcal, \\\\DeltaE = 188 kcal', 'J. \\\\DeltaH = 348 kcal, \\\\DeltaE = 10 kcal'], 'answer': 'A'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 49.00 / 297 (16.5%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [04:38<00:00,  1.08it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:27:27 INFO dspy.evaluate.evaluate: Average Metric: 49.0 / 300 (16.3%)\n",
      "2025/01/15 19:27:28 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 16.33\n",
      "\n",
      "2025/01/15 19:27:28 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n",
      "2025/01/15 19:27:28 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n",
      "\n",
      "/home/justinai/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/optuna/_experimental.py:31: ExperimentalWarning: Argument ``multivariate`` is an experimental feature. The interface can change in the future.\n",
      "  warnings.warn(\n",
      "2025/01/15 19:27:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 1 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 15.00 / 24 (62.5%):  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 24/25 [00:49<00:07,  7.76s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:28:29 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'The isomer of C_2H_5OH(l) is CH_3 - O - CH_3(g) . Given that, at 25°C, the heat of formation of C_2H_5OH(l) = - 66 kcal/mole, the heat of combustion of the isomeric CH_3 - O - CH_3(g) to CO_2(g) and H_2O(l) = - 348 kcal/mole, the heat of formation of H_2O(l) = - 68 kcal/mole and the heat of combustion of carbon to CO_2(g) = - 94 kcal/mole. Calculate a) \\\\DeltaH_298°K for the reaction and b) \\\\DeltaE_298°K for the reaction assuming \\\\DeltaH_298°K in part a) = - 10 kcal.', 'options': ['A. \\\\DeltaH = 22 kcal, \\\\DeltaE = - 10.6 kcal', 'B. \\\\DeltaH = 204 kcal, \\\\DeltaE = - 204 kcal', 'C. \\\\DeltaH = 204 kcal, \\\\DeltaE = 204 kcal', 'D. \\\\DeltaH = 348 kcal, \\\\DeltaE = - 188 kcal', 'E. \\\\DeltaH = 66 kcal, \\\\DeltaE = - 10 kcal', 'F. \\\\DeltaH = -66 kcal, \\\\DeltaE = -10 kcal', 'G. \\\\DeltaH = 66 kcal, \\\\DeltaE = 10 kcal', 'H. \\\\DeltaH = -22 kcal, \\\\DeltaE = 10.6 kcal', 'I. \\\\DeltaH = -348 kcal, \\\\DeltaE = 188 kcal', 'J. \\\\DeltaH = 348 kcal, \\\\DeltaE = 10 kcal'], 'answer': 'A'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 15.00 / 24 (62.5%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:01<00:00,  2.45s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:28:29 INFO dspy.evaluate.evaluate: Average Metric: 15.0 / 25 (60.0%)\n",
      "2025/01/15 19:28:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 7'].\n",
      "2025/01/15 19:28:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0]\n",
      "2025/01/15 19:28:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33]\n",
      "2025/01/15 19:28:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.33\n",
      "2025/01/15 19:28:29 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/15 19:28:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 2 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 17.00 / 25 (68.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:38<00:00,  1.56s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:29:08 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)\n",
      "2025/01/15 19:29:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 7'].\n",
      "2025/01/15 19:29:08 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0]\n",
      "2025/01/15 19:29:08 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33]\n",
      "2025/01/15 19:29:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.33\n",
      "2025/01/15 19:29:08 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/15 19:29:08 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 3 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 18.00 / 25 (72.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:51<00:00,  2.05s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:29:59 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)\n",
      "2025/01/15 19:29:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 18'].\n",
      "2025/01/15 19:29:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0]\n",
      "2025/01/15 19:29:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33]\n",
      "2025/01/15 19:29:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.33\n",
      "2025/01/15 19:29:59 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/15 19:29:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 4 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 17.00 / 25 (68.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:05<00:00,  2.63s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:31:05 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)\n",
      "2025/01/15 19:31:05 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 15', 'Predictor 0: Few-Shot Set 2'].\n",
      "2025/01/15 19:31:05 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0]\n",
      "2025/01/15 19:31:05 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33]\n",
      "2025/01/15 19:31:05 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.33\n",
      "2025/01/15 19:31:05 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/15 19:31:05 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 5 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 18.00 / 25 (72.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:33<00:00,  1.36s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:31:39 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)\n",
      "2025/01/15 19:31:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 18'].\n",
      "2025/01/15 19:31:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0]\n",
      "2025/01/15 19:31:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33]\n",
      "2025/01/15 19:31:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.33\n",
      "2025/01/15 19:31:39 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/15 19:31:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 6 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 17.00 / 25 (68.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:30<00:00,  1.23s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:32:10 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)\n",
      "2025/01/15 19:32:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/15 19:32:10 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0]\n",
      "2025/01/15 19:32:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33]\n",
      "2025/01/15 19:32:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.33\n",
      "2025/01/15 19:32:10 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/15 19:32:10 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 7 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 17.00 / 25 (68.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:59<00:00,  2.38s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:33:10 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)\n",
      "2025/01/15 19:33:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 12'].\n",
      "2025/01/15 19:33:10 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0]\n",
      "2025/01/15 19:33:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33]\n",
      "2025/01/15 19:33:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.33\n",
      "2025/01/15 19:33:10 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/15 19:33:10 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 8 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 18.00 / 25 (72.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:52<00:00,  2.12s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:34:03 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)\n",
      "2025/01/15 19:34:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 13'].\n",
      "2025/01/15 19:34:03 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0]\n",
      "2025/01/15 19:34:03 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33]\n",
      "2025/01/15 19:34:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.33\n",
      "2025/01/15 19:34:03 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/15 19:34:03 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 9 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 21.00 / 25 (84.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:38<00:00,  1.53s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:34:41 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n",
      "2025/01/15 19:34:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4'].\n",
      "2025/01/15 19:34:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0]\n",
      "2025/01/15 19:34:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33]\n",
      "2025/01/15 19:34:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.33\n",
      "2025/01/15 19:34:41 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/15 19:34:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 10 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 18.00 / 25 (72.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:22<00:00,  1.11it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:35:04 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)\n",
      "2025/01/15 19:35:04 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/15 19:35:04 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0]\n",
      "2025/01/15 19:35:04 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33]\n",
      "2025/01/15 19:35:04 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 16.33\n",
      "2025/01/15 19:35:04 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:35:04 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 1 =====\n",
      "2025/01/15 19:35:04 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 84.0) from minibatch trials...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 49.00 / 73 (67.1%):  24%|███████████████████████████▉                                                                                       | 73/300 [01:27<05:59,  1.58s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:36:33 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A column is used to absorb ammonia in water from a feed of airand ammonia. The column is at 1atmand 0°C. Assume theresistance to transfer is completely in the gas phase, in a gasfilm 1.5 mm thick. The partial pressure of NH_3 is 55 mm Hg at one point in the column. What is the transferrate per unit area at this location in the column? The diffusivityof ammonia in air is 0.198 cm^2 / sec at 0°C.', 'options': ['A. 5.4 × 10^-6gmoles/ cm^2-sec', 'B. 7.1 × 10^-6 gmoles/cm^2-sec', 'C. 2.8 × 10^-6 gmoles/cm^2-sec', 'D. 6.2 × 10^-6gmoles/ cm^2-sec', 'E. 1.6 × 10^-6 gmoles/cm^2-sec', 'F. 3.9 × 10^-6 gmoles/cm^2-sec', 'G. 8.5 × 10^-6 gmoles/cm^2-sec', 'H. 3.5 × 10^-6gmoles/ cm^2-sec', 'I. 4.4 × 10^-6gmoles/ cm^2-sec', 'J. 5.0 × 10^-6 gmoles/cm^2-sec'], 'answer': 'I'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 89.00 / 131 (67.9%):  44%|█████████████████████████████████████████████████▋                                                               | 132/300 [02:45<02:52,  1.02s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:37:52 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A tank contains 100 gal of water and $50 \\\\mathrm{oz}$ of salt. Water containing a salt concentration of $\\\\frac{1}{4}\\\\left(1+\\\\frac{1}{2} \\\\sin t\\\\right) \\\\mathrm{oz} / \\\\mathrm{gal}$ flows into the tank at a rate of $2 \\\\mathrm{gal} / \\\\mathrm{min}$, and the mixture in the tank flows out at the same rate.\\nThe long-time behavior of the solution is an oscillation about a certain constant level. What is the amplitude of the oscillation?', 'options': ['A. 0.14995', 'B.  0.24995', 'C. 0.34995', 'D. 0.29995', 'E. 0.50000', 'F. 0.44995', 'G. 0.39995', 'H. 0.19995', 'I. 0.59995', 'J. 0.10000'], 'answer': 'B'}) (input_keys={'options', 'question'}): 'list' object has no attribute 'items'. Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 122.00 / 171 (71.3%):  58%|████████████████████████████████████████████████████████████████▌                                               | 173/300 [03:29<01:58,  1.07it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:38:35 ERROR dspy.utils.parallelizer: Error processing item Example({'question': ' In the infrared spectrum of $\\\\mathrm{H}^{127} \\\\mathrm{I}$, there is an intense line at $2309 \\\\mathrm{~cm}^{-1}$. Calculate the force constant of $\\\\mathrm{H}^{127} \\\\mathrm{I}$.', 'options': ['A. 400 $ \\\\mathrm{~N} \\\\cdot \\\\mathrm{m}^{-1}$', 'B. 350 $ \\\\mathrm{~N} \\\\cdot \\\\mathrm{m}^{-1}$', 'C. 250 $ \\\\mathrm{~N} \\\\cdot \\\\mathrm{m}^{-1}$', 'D. 285 $\\\\mathrm{~N} \\\\cdot \\\\mathrm{m}^{-1}$', 'E. 450 $\\\\mathrm{~N} \\\\cdot \\\\mathrm{m}^{-1}$', 'F. 365 $\\\\mathrm{~N} \\\\cdot \\\\mathrm{m}^{-1}$', 'G. 313 $ \\\\mathrm{~N} \\\\cdot \\\\mathrm{m}^{-1}$', 'H. 325 $\\\\mathrm{~N} \\\\cdot \\\\mathrm{m}^{-1}$', 'I. 500 $\\\\mathrm{~N} \\\\cdot \\\\mathrm{m}^{-1}$', 'J. 275 $\\\\mathrm{~N} \\\\cdot \\\\mathrm{m}^{-1}$'], 'answer': 'G'}) (input_keys={'options', 'question'}): 'list' object has no attribute 'items'. Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 159.00 / 221 (71.9%):  75%|███████████████████████████████████████████████████████████████████████████████████▋                            | 224/300 [04:24<01:06,  1.15it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:39:29 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A shaft has to transmit 30 hp at 400 rpm. The ultimate shear stress for the steel shaft is 60,000 psi with a factor of safety of 5. Determine the diameter if the shaft selected is to be a) solid and b) hollow with the ratio of inside diameter to outside diameter of 0.5.', 'options': ['A. Solid shaft diameter: 1.625 in, Hollow shaft outer diameter: 1.625 in, Hollow shaft inner diameter: 0.8125 in', 'B. Solid shaft diameter: 1.2 in, Hollow shaft outer diameter: 1.2 in, Hollow shaft inner diameter: 0.6 in', 'C. Solid shaft diameter: 1.4 in, Hollow shaft outer diameter: 1.4 in, Hollow shaft inner diameter: 0.7 in', 'D. Solid shaft diameter: 1.5 in, Hollow shaft outer diameter: 1.5 in, Hollow shaft inner diameter: 0.75 in', 'E. Solid shaft diameter: 1.25 in, Hollow shaft outer diameter: 1.25 in, Hollow shaft inner diameter: 0.625 in', 'F. Solid shaft diameter: 1.375 in, Hollow shaft outer diameter: 1.375 in, Hollow shaft inner diameter: 0.69 in', 'G. Solid shaft diameter: 2 in, Hollow shaft outer diameter: 2 in, Hollow shaft inner diameter: 1 in', 'H. Solid shaft diameter: 1.125 in, Hollow shaft outer diameter: 1.125 in, Hollow shaft inner diameter: 0.5625 in', 'I. Solid shaft diameter: 1 5/16 in, Hollow shaft outer diameter: 1 5/16 in, Hollow shaft inner diameter: 0.66 in', 'J. Solid shaft diameter: 1.75 in, Hollow shaft outer diameter: 1.75 in, Hollow shaft inner diameter: 0.875 in'], 'answer': 'I'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 179.00 / 248 (72.2%):  84%|██████████████████████████████████████████████████████████████████████████████████████████████                  | 252/300 [04:50<00:50,  1.06s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:39:57 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'The conductivity of silver is g = 3 × 10^7 mhos/m at microwave frequencies. Find the skin depth at 10^10Hz. Also calculate the frequency at which skin depth in sea water is one meter.', 'options': ['A. Skin depth in silver = 11.2 × 10^-5 cm, Frequency in sea water = 98.6 × 10^3 Hz', 'B. Skin depth in silver = 8.2 × 10^-5 cm, Frequency in sea water = 88.6 × 10^3 Hz', 'C. Skin depth in silver = 10.2 × 10^-5 cm, Frequency in sea water = 78.6 × 10^3 Hz', 'D. Skin depth in silver = 9.2 × 10^-5 cm, Frequency in sea water = 58.6 × 10^3 Hz', 'E. Skin depth in silver = 4.2 × 10^-5 cm, Frequency in sea water = 18.6 × 10^3 Hz', 'F. Skin depth in silver = 7.2 × 10^-5 cm, Frequency in sea water = 48.6 × 10^3 Hz', 'G. Skin depth in silver = 1.2 × 10^-5 cm, Frequency in sea water = 68.6 × 10^3 Hz', 'H. Skin depth in silver = 5.2 × 10^-5 cm, Frequency in sea water = 28.6 × 10^3 Hz', 'I. Skin depth in silver = 2.2 × 10^-5 cm, Frequency in sea water = 8.6 × 10^3 Hz', 'J. Skin depth in silver = 6.2 × 10^-5 cm, Frequency in sea water = 38.6 × 10^3 Hz'], 'answer': 'D'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 211.00 / 295 (71.5%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [05:38<00:00,  1.13s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:40:43 INFO dspy.evaluate.evaluate: Average Metric: 211.0 / 300 (70.3%)\n",
      "2025/01/15 19:40:43 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mNew best full eval score!\u001b[0m Score: 70.33\n",
      "2025/01/15 19:40:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33]\n",
      "2025/01/15 19:40:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.33\n",
      "2025/01/15 19:40:43 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "2025/01/15 19:40:43 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "\n",
      "2025/01/15 19:40:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 11 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 19.00 / 25 (76.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:58<00:00,  2.35s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:41:41 INFO dspy.evaluate.evaluate: Average Metric: 19 / 25 (76.0%)\n",
      "2025/01/15 19:41:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 4'].\n",
      "2025/01/15 19:41:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0]\n",
      "2025/01/15 19:41:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33]\n",
      "2025/01/15 19:41:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.33\n",
      "2025/01/15 19:41:42 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:41:42 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 12 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 15.00 / 25 (60.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 1545.73it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:41:42 INFO dspy.evaluate.evaluate: Average Metric: 15 / 25 (60.0%)\n",
      "2025/01/15 19:41:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4'].\n",
      "2025/01/15 19:41:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0]\n",
      "2025/01/15 19:41:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33]\n",
      "2025/01/15 19:41:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.33\n",
      "2025/01/15 19:41:42 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:41:42 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 13 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 19.00 / 25 (76.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:24<00:00,  1.03it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:42:06 INFO dspy.evaluate.evaluate: Average Metric: 19 / 25 (76.0%)\n",
      "2025/01/15 19:42:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 17'].\n",
      "2025/01/15 19:42:06 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0, 76.0]\n",
      "2025/01/15 19:42:06 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33]\n",
      "2025/01/15 19:42:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.33\n",
      "2025/01/15 19:42:06 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:42:06 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 14 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 21.00 / 25 (84.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:00<00:00,  2.40s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:43:06 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n",
      "2025/01/15 19:43:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 4'].\n",
      "2025/01/15 19:43:06 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0, 76.0, 84.0]\n",
      "2025/01/15 19:43:06 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33]\n",
      "2025/01/15 19:43:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.33\n",
      "2025/01/15 19:43:06 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:43:06 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 15 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 19.00 / 25 (76.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:37<00:00,  1.50s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:43:44 INFO dspy.evaluate.evaluate: Average Metric: 19 / 25 (76.0%)\n",
      "2025/01/15 19:43:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 76.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 5'].\n",
      "2025/01/15 19:43:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0, 76.0, 84.0, 76.0]\n",
      "2025/01/15 19:43:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33]\n",
      "2025/01/15 19:43:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.33\n",
      "2025/01/15 19:43:44 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:43:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 16 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 18.00 / 24 (75.0%):  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 24/25 [00:33<00:03,  3.24s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:44:45 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A pitcher can throw a baseball weighing 5 ounces so that it will have a velocity of 96 feet per second. What is its kinetic energy?', 'options': ['A. 55 ft-lb', 'B. 45 ft-lb', 'C. 60 ft-lb', 'D. 5 ft-lb', 'E. 24 ft-lb', 'F. 32 ft-lb', 'G. 96 ft-lb', 'H. 15 ft-lb', 'I. 10 ft-lb', 'J. 72 ft-lb'], 'answer': 'B'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 18.00 / 24 (75.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:00<00:00,  2.42s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:44:45 INFO dspy.evaluate.evaluate: Average Metric: 18.0 / 25 (72.0%)\n",
      "2025/01/15 19:44:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 4'].\n",
      "2025/01/15 19:44:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0, 76.0, 84.0, 76.0, 72.0]\n",
      "2025/01/15 19:44:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33]\n",
      "2025/01/15 19:44:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.33\n",
      "2025/01/15 19:44:45 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:44:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 17 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 21.00 / 25 (84.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:35<00:00,  1.41s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:45:20 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n",
      "2025/01/15 19:45:20 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 13', 'Predictor 0: Few-Shot Set 10'].\n",
      "2025/01/15 19:45:20 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0, 76.0, 84.0, 76.0, 72.0, 84.0]\n",
      "2025/01/15 19:45:20 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33]\n",
      "2025/01/15 19:45:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.33\n",
      "2025/01/15 19:45:20 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:45:20 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 18 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 18.00 / 25 (72.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:57<00:00,  2.31s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:46:18 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)\n",
      "2025/01/15 19:46:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 11'].\n",
      "2025/01/15 19:46:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0, 76.0, 84.0, 76.0, 72.0, 84.0, 72.0]\n",
      "2025/01/15 19:46:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33]\n",
      "2025/01/15 19:46:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.33\n",
      "2025/01/15 19:46:18 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:46:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 19 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 18.00 / 25 (72.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:46<00:00,  1.87s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:47:05 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)\n",
      "2025/01/15 19:47:05 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 3'].\n",
      "2025/01/15 19:47:05 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0, 76.0, 84.0, 76.0, 72.0, 84.0, 72.0, 72.0]\n",
      "2025/01/15 19:47:05 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33]\n",
      "2025/01/15 19:47:05 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.33\n",
      "2025/01/15 19:47:05 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:47:05 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 20 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 15.00 / 25 (60.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:53<00:00,  2.14s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:47:59 INFO dspy.evaluate.evaluate: Average Metric: 15 / 25 (60.0%)\n",
      "2025/01/15 19:47:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 4'].\n",
      "2025/01/15 19:47:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0, 76.0, 84.0, 76.0, 72.0, 84.0, 72.0, 72.0, 60.0]\n",
      "2025/01/15 19:47:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33]\n",
      "2025/01/15 19:47:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.33\n",
      "2025/01/15 19:47:59 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:47:59 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 2 =====\n",
      "2025/01/15 19:47:59 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 84.0) from minibatch trials...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 106.00 / 147 (72.1%):  49%|██████████████████████████████████████████████████████▉                                                         | 147/300 [02:42<02:24,  1.06it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:50:42 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'The vertical tail propeller on a helicopter prevents the cab from rotating when the speed of the main propeller changes. Knowing that the cab has a centroidal moment of in-ertia of 700 lb \\\\textbullet ft \\\\textbullet sec^2 and that the four main blades are 14 ft slender rods weighing 55 lbs, determine the final angu-lar momentum of the cab after the speed of the main blades changes from 200 to 280 rpm. Assume the tail propeller of the helicopter to be inoperational.', 'options': ['A. 111.58 lb*ft*sec^2', 'B. 2980.6 lb*ft*sec', 'C. 357.75 lb*ft*sec^2', 'D. 1590.4 lb*ft*sec', 'E. 528.8 rpm', 'F. -31.15 rpm', 'G. 0 lb*ft*sec (no change in angular momentum)', 'H. -446.3 lb*ft*sec^2', 'I. 446.3 lb*ft*sec^2', 'J. 2283.2 lb*ft*sec'], 'answer': 'J'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 123.00 / 168 (73.2%):  56%|██████████████████████████████████████████████████████████████▋                                                 | 168/300 [03:02<02:29,  1.14s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:51:03 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'The isomer of C_2H_5OH(l) is CH_3 - O - CH_3(g) . Given that, at 25°C, the heat of formation of C_2H_5OH(l) = - 66 kcal/mole, the heat of combustion of the isomeric CH_3 - O - CH_3(g) to CO_2(g) and H_2O(l) = - 348 kcal/mole, the heat of formation of H_2O(l) = - 68 kcal/mole and the heat of combustion of carbon to CO_2(g) = - 94 kcal/mole. Calculate a) \\\\DeltaH_298°K for the reaction and b) \\\\DeltaE_298°K for the reaction assuming \\\\DeltaH_298°K in part a) = - 10 kcal.', 'options': ['A. \\\\DeltaH = 22 kcal, \\\\DeltaE = - 10.6 kcal', 'B. \\\\DeltaH = 204 kcal, \\\\DeltaE = - 204 kcal', 'C. \\\\DeltaH = 204 kcal, \\\\DeltaE = 204 kcal', 'D. \\\\DeltaH = 348 kcal, \\\\DeltaE = - 188 kcal', 'E. \\\\DeltaH = 66 kcal, \\\\DeltaE = - 10 kcal', 'F. \\\\DeltaH = -66 kcal, \\\\DeltaE = -10 kcal', 'G. \\\\DeltaH = 66 kcal, \\\\DeltaE = 10 kcal', 'H. \\\\DeltaH = -22 kcal, \\\\DeltaE = 10.6 kcal', 'I. \\\\DeltaH = -348 kcal, \\\\DeltaE = 188 kcal', 'J. \\\\DeltaH = 348 kcal, \\\\DeltaE = 10 kcal'], 'answer': 'A'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 212.00 / 298 (71.1%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [05:37<00:00,  1.13s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:53:36 INFO dspy.evaluate.evaluate: Average Metric: 212.0 / 300 (70.7%)\n",
      "2025/01/15 19:53:36 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mNew best full eval score!\u001b[0m Score: 70.67\n",
      "2025/01/15 19:53:36 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33, 70.67]\n",
      "2025/01/15 19:53:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.67\n",
      "2025/01/15 19:53:36 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "2025/01/15 19:53:36 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "\n",
      "2025/01/15 19:53:36 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 21 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 18.00 / 25 (72.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:30<00:00,  1.23s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:54:07 INFO dspy.evaluate.evaluate: Average Metric: 18 / 25 (72.0%)\n",
      "2025/01/15 19:54:07 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 15'].\n",
      "2025/01/15 19:54:07 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0, 76.0, 84.0, 76.0, 72.0, 84.0, 72.0, 72.0, 60.0, 72.0]\n",
      "2025/01/15 19:54:07 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33, 70.67]\n",
      "2025/01/15 19:54:07 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.67\n",
      "2025/01/15 19:54:07 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:54:07 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 22 / 25 ==\n",
      "2025/01/15 19:54:07 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'The vertical tail propeller on a helicopter prevents the cab from rotating when the speed of the main propeller changes. Knowing that the cab has a centroidal moment of in-ertia of 700 lb \\\\textbullet ft \\\\textbullet sec^2 and that the four main blades are 14 ft slender rods weighing 55 lbs, determine the final angu-lar momentum of the cab after the speed of the main blades changes from 200 to 280 rpm. Assume the tail propeller of the helicopter to be inoperational.', 'options': ['A. 111.58 lb*ft*sec^2', 'B. 2980.6 lb*ft*sec', 'C. 357.75 lb*ft*sec^2', 'D. 1590.4 lb*ft*sec', 'E. 528.8 rpm', 'F. -31.15 rpm', 'G. 0 lb*ft*sec (no change in angular momentum)', 'H. -446.3 lb*ft*sec^2', 'I. 446.3 lb*ft*sec^2', 'J. 2283.2 lb*ft*sec'], 'answer': 'J'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 21.00 / 24 (87.5%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 1667.18it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:54:08 INFO dspy.evaluate.evaluate: Average Metric: 21.0 / 25 (84.0%)\n",
      "2025/01/15 19:54:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 13', 'Predictor 0: Few-Shot Set 10'].\n",
      "2025/01/15 19:54:08 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0, 76.0, 84.0, 76.0, 72.0, 84.0, 72.0, 72.0, 60.0, 72.0, 84.0]\n",
      "2025/01/15 19:54:08 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33, 70.67]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:54:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.67\n",
      "2025/01/15 19:54:08 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:54:08 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 23 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 17.00 / 25 (68.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:45<00:00,  1.81s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:54:53 INFO dspy.evaluate.evaluate: Average Metric: 17 / 25 (68.0%)\n",
      "2025/01/15 19:54:53 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 9'].\n",
      "2025/01/15 19:54:53 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0, 76.0, 84.0, 76.0, 72.0, 84.0, 72.0, 72.0, 60.0, 72.0, 84.0, 68.0]\n",
      "2025/01/15 19:54:53 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33, 70.67]\n",
      "2025/01/15 19:54:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.67\n",
      "2025/01/15 19:54:53 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:54:53 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 24 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 2.00 / 25 (8.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [01:02<00:00,  2.51s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:55:56 INFO dspy.evaluate.evaluate: Average Metric: 2 / 25 (8.0%)\n",
      "2025/01/15 19:55:56 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 8.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 13', 'Predictor 0: Few-Shot Set 0'].\n",
      "2025/01/15 19:55:56 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0, 76.0, 84.0, 76.0, 72.0, 84.0, 72.0, 72.0, 60.0, 72.0, 84.0, 68.0, 8.0]\n",
      "2025/01/15 19:55:56 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33, 70.67]\n",
      "2025/01/15 19:55:56 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.67\n",
      "2025/01/15 19:55:56 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:55:56 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 25 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 20.00 / 25 (80.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:54<00:00,  2.20s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:56:51 INFO dspy.evaluate.evaluate: Average Metric: 20 / 25 (80.0%)\n",
      "2025/01/15 19:56:51 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 9', 'Predictor 0: Few-Shot Set 10'].\n",
      "2025/01/15 19:56:51 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [60.0, 68.0, 72.0, 68.0, 72.0, 68.0, 68.0, 72.0, 84.0, 72.0, 76.0, 60.0, 76.0, 84.0, 76.0, 72.0, 84.0, 72.0, 72.0, 60.0, 72.0, 84.0, 68.0, 8.0, 80.0]\n",
      "2025/01/15 19:56:51 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33, 70.67]\n",
      "2025/01/15 19:56:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 70.67\n",
      "2025/01/15 19:56:51 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/15 19:56:51 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 3 =====\n",
      "2025/01/15 19:56:51 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 80.0) from minibatch trials...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 116.00 / 163 (71.2%):  54%|████████████████████████████████████████████████████████████▊                                                   | 163/300 [02:56<02:19,  1.02s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 19:59:49 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'The isomer of C_2H_5OH(l) is CH_3 - O - CH_3(g) . Given that, at 25°C, the heat of formation of C_2H_5OH(l) = - 66 kcal/mole, the heat of combustion of the isomeric CH_3 - O - CH_3(g) to CO_2(g) and H_2O(l) = - 348 kcal/mole, the heat of formation of H_2O(l) = - 68 kcal/mole and the heat of combustion of carbon to CO_2(g) = - 94 kcal/mole. Calculate a) \\\\DeltaH_298°K for the reaction and b) \\\\DeltaE_298°K for the reaction assuming \\\\DeltaH_298°K in part a) = - 10 kcal.', 'options': ['A. \\\\DeltaH = 22 kcal, \\\\DeltaE = - 10.6 kcal', 'B. \\\\DeltaH = 204 kcal, \\\\DeltaE = - 204 kcal', 'C. \\\\DeltaH = 204 kcal, \\\\DeltaE = 204 kcal', 'D. \\\\DeltaH = 348 kcal, \\\\DeltaE = - 188 kcal', 'E. \\\\DeltaH = 66 kcal, \\\\DeltaE = - 10 kcal', 'F. \\\\DeltaH = -66 kcal, \\\\DeltaE = -10 kcal', 'G. \\\\DeltaH = 66 kcal, \\\\DeltaE = 10 kcal', 'H. \\\\DeltaH = -22 kcal, \\\\DeltaE = 10.6 kcal', 'I. \\\\DeltaH = -348 kcal, \\\\DeltaE = 188 kcal', 'J. \\\\DeltaH = 348 kcal, \\\\DeltaE = 10 kcal'], 'answer': 'A'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 217.00 / 298 (72.8%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 299/300 [05:03<00:02,  2.51s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 20:02:20 ERROR dspy.utils.parallelizer: Error processing item Example({'question': \"A 'fishbowl' of height 4r/3 is formed by removing the top third of a sphere of radius r=6. The fishbowl is fixed in sand so that its rim is parallel with the ground. A small marble of mass m rests at the bottom of the fishbowl. Assuming all surfaces are frictionless and ignoring air resistance, find the maximum initial velocity that could be given to the marble for it to land back in the fishbowl with g=9.8.\", 'options': ['A. 20.5', 'B. 24.0', 'C. 16.4', 'D. 22.3', 'E. 18.25', 'F. 23.1', 'G. 21.7', 'H. 19.6', 'I. 17.0', 'J. 15.75'], 'answer': 'E'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 217.00 / 298 (72.8%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [05:29<00:00,  1.10s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/15 20:02:20 INFO dspy.evaluate.evaluate: Average Metric: 217.0 / 300 (72.3%)\n",
      "2025/01/15 20:02:20 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mNew best full eval score!\u001b[0m Score: 72.33\n",
      "2025/01/15 20:02:20 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [16.33, 70.33, 70.67, 72.33]\n",
      "2025/01/15 20:02:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 72.33\n",
      "2025/01/15 20:02:20 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "2025/01/15 20:02:20 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "\n",
      "2025/01/15 20:02:20 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 72.33!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "subset_size = 500\n",
    "optimizer = dspy.MIPROv2(\n",
    "    metric=benchmark.metric,\n",
    "    auto=\"medium\",\n",
    "    num_threads=NUM_THREADS,\n",
    "    task_model=TASK_MODEL,\n",
    "    prompt_model=PROMPT_MODEL,\n",
    "    max_labeled_demos=FEW_SHOTS,\n",
    ")\n",
    "\n",
    "optimized_program = optimizer.compile(\n",
    "    program,\n",
    "    trainset=trainset[:subset_size],\n",
    "    valset=valset[:subset_size],\n",
    "    requires_permission_to_run=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BEST PROMPT:\n",
      " You are a skilled engineer and problem solver. Given the fields `question`, `options`, produce the fields `reasoning`, `answer` by carefully analyzing the problem, applying relevant formulas and principles, and selecting the most appropriate answer based on your calculations and logical deductions. Ensure your reasoning is step-by-step and clearly explains how you arrived at your answer.\n"
     ]
    }
   ],
   "source": [
    "print(\"BEST PROMPT:\\n\", optimized_program.predict.signature.instructions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 158.00 / 212 (74.5%):  42%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                                                                                 | 212/500 [03:52<05:18,  1.11s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/16 10:35:33 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A microcomputer used for data acquisition and control is required to digitize and process four analog input signals and to output their average continually; i.e., in real time. The time for an external analog-to-digital converter (which is triggered by a CPU instruction) to digitize one input is 12 microseconds, and only one digitization occurs at a time. Five CPU instructions, including the triggering instruction, must be executed for each signal digitized. Ten CPU instructions are executed in order to average each set of four samples and output this value. The time to convert the output from digital to analog form is to be ignored. If it is assumed that suitable data buffering is employed, then the maximum average instruction execution time that allows the microcomputer to keep up with the input-output data rates, is', 'options': ['A. 3.0 microseconds', 'B. 2.6 microseconds', 'C. 2.4 microseconds', 'D. 2.0 microseconds', 'E. 0.6 microseconds', 'F. 1.6 microseconds', 'G. 0.8 microseconds', 'H. 1.8 microseconds', 'I. 1.2 microseconds', 'J. 1.0 microseconds'], 'answer': 'F'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 174.00 / 234 (74.4%):  47%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                                      | 235/500 [04:17<04:51,  1.10s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/16 10:35:58 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A 34 kg steel casting at a temperature of 427°C is quenched in136 kg of oil initially at 21°C. Assuming no heat losses and thesteel casting and oil to have constant specific heats of 0.5024and2.5121 kJ/kg-°K respectively, determine the - changein entropy for a system consisting of the oil and casting.', 'options': ['A. 11.99 kJ/°K', 'B. 13.99 kJ/°K', 'C. 6.99 kJ/°K', 'D. 10.99 kJ/°K', 'E. 5.99 kJ/°K', 'F. 9.99 kJ/°K', 'G. 4.99 kJ/°K', 'H. 8.99 kJ/°K', 'I. 12.99 kJ/°K', 'J. 7.99 kJ/°K'], 'answer': 'J'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 241.00 / 329 (73.3%):  66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                      | 330/500 [06:11<03:36,  1.27s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/16 10:37:50 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Consider the discrete memoryless channel $Y=XZ$ where $X$ and $Z$ are independent binary random variables that take on values 0 and 1. Let $P(Z=1)=0.5$. Find the capacity of this channel in bits.', 'options': ['A. 1.322', 'B. 0.500', 'C. 0.700', 'D. 0.100', 'E. 0.750', 'F. 1.000', 'G. 0.585', 'H. 0.322', 'I. 0.250', 'J. 0.811'], 'answer': 'H'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 257.00 / 350 (73.4%):  71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                          | 353/500 [06:33<03:56,  1.61s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/16 10:38:13 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A neutron at rest decays (breaks up) to a proton and an electron. Energy is released in the decay and appears as kinetic energy of the proton and electron. The mass of a proton is 1836 times the mass of an electron. What fraction of the total energy released goes into the kinetic energy of the proton?', 'options': ['A. 0.1', 'B. 0.99', 'C. 0.5', 'D. 0.000544', 'E. 0.544', 'F. 0.00544', 'G. 0.1836', 'H. 0.0000918', 'I. 0.0544', 'J. 0.9180'], 'answer': 'D'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 303.00 / 416 (72.8%):  84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 420/500 [07:47<01:18,  1.02it/s]"
     ]
    },
    {
     "ename": "AttributeError",
     "evalue": "'list' object has no attribute 'items'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/base.py:30\u001b[0m, in \u001b[0;36mAdapter.__call__\u001b[0;34m(self, lm, lm_kwargs, signature, demos, inputs)\u001b[0m\n\u001b[1;32m     28\u001b[0m     output, output_logprobs \u001b[38;5;241m=\u001b[39m output[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m], output[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlogprobs\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m---> 30\u001b[0m value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43msignature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     32\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mset\u001b[39m(value\u001b[38;5;241m.\u001b[39mkeys()) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mset\u001b[39m(signature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys()), \\\n\u001b[1;32m     33\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpected \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msignature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/callback.py:234\u001b[0m, in \u001b[0;36mwith_callbacks.<locals>.wrapper\u001b[0;34m(instance, *args, **kwargs)\u001b[0m\n\u001b[1;32m    233\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m callbacks:\n\u001b[0;32m--> 234\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    236\u001b[0m \u001b[38;5;66;03m# Generate call ID as the unique identifier for the call, this is useful for instrumentation.\u001b[39;00m\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/chat_adapter.py:84\u001b[0m, in \u001b[0;36mChatAdapter.parse\u001b[0;34m(self, signature, completion)\u001b[0m\n\u001b[1;32m     83\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m fields\u001b[38;5;241m.\u001b[39mkeys() \u001b[38;5;241m!=\u001b[39m signature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[0;32m---> 84\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpected \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msignature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfields\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     86\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fields\n",
      "\u001b[0;31mValueError\u001b[0m: Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning'])",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[41], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m score, results, all_scores \u001b[38;5;241m=\u001b[39m \u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      2\u001b[0m \u001b[43m    \u001b[49m\u001b[43moptimized_program\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m      3\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdevset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtestset\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43msubset_size\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m      4\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdisplay_table\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m      5\u001b[0m \u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/evaluate/evaluate.py:160\u001b[0m, in \u001b[0;36mEvaluate.__call__\u001b[0;34m(self, program, metric, devset, num_threads, display_progress, display_table, return_all_scores, return_outputs)\u001b[0m\n\u001b[1;32m    156\u001b[0m         program\u001b[38;5;241m.\u001b[39m_suggest_failures \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m dspy\u001b[38;5;241m.\u001b[39msettings\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msuggest_failures\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    158\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m prediction, score\n\u001b[0;32m--> 160\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mexecutor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_item\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    161\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(devset) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(results)\n\u001b[1;32m    163\u001b[0m results \u001b[38;5;241m=\u001b[39m [((dspy\u001b[38;5;241m.\u001b[39mPrediction(), \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfailure_score) \u001b[38;5;28;01mif\u001b[39;00m r \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m r) \u001b[38;5;28;01mfor\u001b[39;00m r \u001b[38;5;129;01min\u001b[39;00m results]\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/parallelizer.py:39\u001b[0m, in \u001b[0;36mParallelExecutor.execute\u001b[0;34m(self, function, data)\u001b[0m\n\u001b[1;32m     37\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_execute_isolated_single_thread(wrapped_function, data)\n\u001b[1;32m     38\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 39\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_multi_thread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwrapped_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/parallelizer.py:180\u001b[0m, in \u001b[0;36mParallelExecutor._execute_multi_thread\u001b[0;34m(self, function, data)\u001b[0m\n\u001b[1;32m    172\u001b[0m pbar \u001b[38;5;241m=\u001b[39m tqdm\u001b[38;5;241m.\u001b[39mtqdm(\n\u001b[1;32m    173\u001b[0m     total\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(data),\n\u001b[1;32m    174\u001b[0m     dynamic_ncols\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m    175\u001b[0m     disable\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdisable_progress_bar,\n\u001b[1;32m    176\u001b[0m     file\u001b[38;5;241m=\u001b[39msys\u001b[38;5;241m.\u001b[39mstdout\n\u001b[1;32m    177\u001b[0m )\n\u001b[1;32m    179\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m future \u001b[38;5;129;01min\u001b[39;00m as_completed(futures):\n\u001b[0;32m--> 180\u001b[0m     index, result \u001b[38;5;241m=\u001b[39m \u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    182\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m job_cancelled:\n\u001b[1;32m    183\u001b[0m         \u001b[38;5;28;01mcontinue\u001b[39;00m\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/concurrent/futures/_base.py:451\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    449\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n\u001b[1;32m    450\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[0;32m--> 451\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__get_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    453\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_condition\u001b[38;5;241m.\u001b[39mwait(timeout)\n\u001b[1;32m    455\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/concurrent/futures/_base.py:403\u001b[0m, in \u001b[0;36mFuture.__get_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    401\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception:\n\u001b[1;32m    402\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 403\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception\n\u001b[1;32m    404\u001b[0m     \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m    405\u001b[0m         \u001b[38;5;66;03m# Break a reference cycle with the exception in self._exception\u001b[39;00m\n\u001b[1;32m    406\u001b[0m         \u001b[38;5;28mself\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/concurrent/futures/thread.py:58\u001b[0m, in \u001b[0;36m_WorkItem.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     55\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m     57\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     59\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m     60\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfuture\u001b[38;5;241m.\u001b[39mset_exception(exc)\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/parallelizer.py:158\u001b[0m, in \u001b[0;36mParallelExecutor._execute_multi_thread.<locals>.cancellable_function\u001b[0;34m(parent_overrides, index_item)\u001b[0m\n\u001b[1;32m    155\u001b[0m thread_local_overrides\u001b[38;5;241m.\u001b[39moverrides \u001b[38;5;241m=\u001b[39m parent_overrides\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m    157\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m index, \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    159\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m    160\u001b[0m     thread_local_overrides\u001b[38;5;241m.\u001b[39moverrides \u001b[38;5;241m=\u001b[39m original_overrides\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/parallelizer.py:54\u001b[0m, in \u001b[0;36mParallelExecutor._wrap_function.<locals>.wrapped\u001b[0;34m(item)\u001b[0m\n\u001b[1;32m     52\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m current_error_count \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmax_errors:\n\u001b[1;32m     53\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcancel_jobs\u001b[38;5;241m.\u001b[39mset()\n\u001b[0;32m---> 54\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m     55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprovide_traceback:\n\u001b[1;32m     56\u001b[0m     logger\u001b[38;5;241m.\u001b[39merror(\n\u001b[1;32m     57\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError processing item \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mitem\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mStack trace:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mtraceback\u001b[38;5;241m.\u001b[39mformat_exc()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     58\u001b[0m     )\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/parallelizer.py:47\u001b[0m, in \u001b[0;36mParallelExecutor._wrap_function.<locals>.wrapped\u001b[0;34m(item)\u001b[0m\n\u001b[1;32m     45\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m     46\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 47\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     48\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m     49\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39merror_lock:\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/evaluate/evaluate.py:149\u001b[0m, in \u001b[0;36mEvaluate.__call__.<locals>.process_item\u001b[0;34m(example)\u001b[0m\n\u001b[1;32m    148\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprocess_item\u001b[39m(example):\n\u001b[0;32m--> 149\u001b[0m     prediction \u001b[38;5;241m=\u001b[39m \u001b[43mprogram\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mexample\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    150\u001b[0m     score \u001b[38;5;241m=\u001b[39m metric(example, prediction)\n\u001b[1;32m    152\u001b[0m     \u001b[38;5;66;03m# Increment assert and suggest failures to program's attributes\u001b[39;00m\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/callback.py:234\u001b[0m, in \u001b[0;36mwith_callbacks.<locals>.wrapper\u001b[0;34m(instance, *args, **kwargs)\u001b[0m\n\u001b[1;32m    232\u001b[0m \u001b[38;5;66;03m# If no callbacks are provided, just call the function\u001b[39;00m\n\u001b[1;32m    233\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m callbacks:\n\u001b[0;32m--> 234\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    236\u001b[0m \u001b[38;5;66;03m# Generate call ID as the unique identifier for the call, this is useful for instrumentation.\u001b[39;00m\n\u001b[1;32m    237\u001b[0m call_id \u001b[38;5;241m=\u001b[39m uuid\u001b[38;5;241m.\u001b[39muuid4()\u001b[38;5;241m.\u001b[39mhex\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/primitives/program.py:22\u001b[0m, in \u001b[0;36mModule.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m     20\u001b[0m \u001b[38;5;129m@with_callbacks\u001b[39m\n\u001b[1;32m     21\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 22\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/predict/chain_of_thought.py:20\u001b[0m, in \u001b[0;36mChainOfThought.forward\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m     19\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 20\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/callback.py:234\u001b[0m, in \u001b[0;36mwith_callbacks.<locals>.wrapper\u001b[0;34m(instance, *args, **kwargs)\u001b[0m\n\u001b[1;32m    232\u001b[0m \u001b[38;5;66;03m# If no callbacks are provided, just call the function\u001b[39;00m\n\u001b[1;32m    233\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m callbacks:\n\u001b[0;32m--> 234\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    236\u001b[0m \u001b[38;5;66;03m# Generate call ID as the unique identifier for the call, this is useful for instrumentation.\u001b[39;00m\n\u001b[1;32m    237\u001b[0m call_id \u001b[38;5;241m=\u001b[39m uuid\u001b[38;5;241m.\u001b[39muuid4()\u001b[38;5;241m.\u001b[39mhex\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/predict/predict.py:81\u001b[0m, in \u001b[0;36mPredict.__call__\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m     79\u001b[0m \u001b[38;5;129m@with_callbacks\u001b[39m\n\u001b[1;32m     80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 81\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/predict/predict.py:111\u001b[0m, in \u001b[0;36mPredict.forward\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m    109\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdspy\u001b[39;00m\n\u001b[1;32m    110\u001b[0m adapter \u001b[38;5;241m=\u001b[39m dspy\u001b[38;5;241m.\u001b[39msettings\u001b[38;5;241m.\u001b[39madapter \u001b[38;5;129;01mor\u001b[39;00m dspy\u001b[38;5;241m.\u001b[39mChatAdapter()\n\u001b[0;32m--> 111\u001b[0m completions \u001b[38;5;241m=\u001b[39m \u001b[43madapter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlm_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msignature\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msignature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdemos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdemos\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    113\u001b[0m pred \u001b[38;5;241m=\u001b[39m Prediction\u001b[38;5;241m.\u001b[39mfrom_completions(completions, signature\u001b[38;5;241m=\u001b[39msignature)\n\u001b[1;32m    115\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_trace\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m) \u001b[38;5;129;01mand\u001b[39;00m dspy\u001b[38;5;241m.\u001b[39msettings\u001b[38;5;241m.\u001b[39mtrace \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/base.py:45\u001b[0m, in \u001b[0;36mAdapter.__call__\u001b[0;34m(self, lm, lm_kwargs, signature, demos, inputs)\u001b[0m\n\u001b[1;32m     43\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mjson_adapter\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m JSONAdapter\n\u001b[1;32m     44\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m, JSONAdapter):\n\u001b[0;32m---> 45\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mJSONAdapter\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlm_kwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msignature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdemos\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     46\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/json_adapter.py:61\u001b[0m, in \u001b[0;36mJSONAdapter.__call__\u001b[0;34m(self, lm, lm_kwargs, signature, demos, inputs)\u001b[0m\n\u001b[1;32m     58\u001b[0m values \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m     60\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m output \u001b[38;5;129;01min\u001b[39;00m outputs:\n\u001b[0;32m---> 61\u001b[0m     value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43msignature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     62\u001b[0m     \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mset\u001b[39m(value\u001b[38;5;241m.\u001b[39mkeys()) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mset\u001b[39m(\n\u001b[1;32m     63\u001b[0m         signature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys()\n\u001b[1;32m     64\u001b[0m     ), \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpected \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msignature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     65\u001b[0m     values\u001b[38;5;241m.\u001b[39mappend(value)\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/callback.py:234\u001b[0m, in \u001b[0;36mwith_callbacks.<locals>.wrapper\u001b[0;34m(instance, *args, **kwargs)\u001b[0m\n\u001b[1;32m    232\u001b[0m \u001b[38;5;66;03m# If no callbacks are provided, just call the function\u001b[39;00m\n\u001b[1;32m    233\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m callbacks:\n\u001b[0;32m--> 234\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    236\u001b[0m \u001b[38;5;66;03m# Generate call ID as the unique identifier for the call, this is useful for instrumentation.\u001b[39;00m\n\u001b[1;32m    237\u001b[0m call_id \u001b[38;5;241m=\u001b[39m uuid\u001b[38;5;241m.\u001b[39muuid4()\u001b[38;5;241m.\u001b[39mhex\n",
      "File \u001b[0;32m~/miniconda3/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/json_adapter.py:95\u001b[0m, in \u001b[0;36mJSONAdapter.parse\u001b[0;34m(self, signature, completion)\u001b[0m\n\u001b[1;32m     93\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mparse\u001b[39m(\u001b[38;5;28mself\u001b[39m, signature, completion):\n\u001b[1;32m     94\u001b[0m     fields \u001b[38;5;241m=\u001b[39m json_repair\u001b[38;5;241m.\u001b[39mloads(completion)\n\u001b[0;32m---> 95\u001b[0m     fields \u001b[38;5;241m=\u001b[39m {k: v \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[43mfields\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mitems\u001b[49m() \u001b[38;5;28;01mif\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m signature\u001b[38;5;241m.\u001b[39moutput_fields}\n\u001b[1;32m     97\u001b[0m     \u001b[38;5;66;03m# attempt to cast each value to type signature.output_fields[k].annotation\u001b[39;00m\n\u001b[1;32m     98\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m fields\u001b[38;5;241m.\u001b[39mitems():\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'items'"
     ]
    }
   ],
   "source": [
    "# Evaluate `optimized_program` on the first `subset_size` examples of `testset`,\n",
    "# with the results-table display switched off.\n",
    "score, results, all_scores = evaluate(\n",
    "    optimized_program, display_table=False, devset=testset[:subset_size]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Heavy Optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a MIPROv2 optimizer using the heavy auto preset, driven by the\n",
    "# notebook-level settings TASK_MODEL, PROMPT_MODEL, FEW_SHOTS and NUM_THREADS\n",
    "# and scored with `benchmark.metric`.\n",
    "# NOTE: this rebinds `optimizer` and `optimized_program`; earlier cells also\n",
    "# use `optimized_program`, so re-run them if the previous program is needed.\n",
    "optimizer = dspy.MIPROv2(\n",
    "    task_model=TASK_MODEL,\n",
    "    prompt_model=PROMPT_MODEL,\n",
    "    metric=benchmark.metric,\n",
    "    max_labeled_demos=FEW_SHOTS,\n",
    "    num_threads=NUM_THREADS,\n",
    "    auto=\"heavy\",\n",
    ")\n",
    "\n",
    "# Compile `program`, passing the training and validation splits to the optimizer.\n",
    "optimized_program = optimizer.compile(program, trainset=trainset, valset=valset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the instruction text held on the optimized program's `predict` signature.\n",
    "print(\n",
    "    \"BEST PROMPT:\\n\",\n",
    "    optimized_program.predict.signature.instructions,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate `optimized_program` on the whole `testset`, with the results-table\n",
    "# display switched off.\n",
    "score, results, all_scores = evaluate(\n",
    "    optimized_program, display_table=False, devset=testset\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}