{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Imports & Env Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/pydantic/_internal/_config.py:345: UserWarning: Valid config keys have changed in V2:\n",
      "* 'fields' has been removed\n",
      "  warnings.warn(message, UserWarning)\n"
     ]
    }
   ],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "import sys\n",
    "import os\n",
    "from dotenv import load_dotenv\n",
    "load_dotenv()\n",
    "\n",
    "import dspy\n",
    "sys.path.append(os.path.abspath('../'))\n",
    "from benchmarks import llama_mmlu_pro, leaderboard_mmlu_pro"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "NUM_THREADS = 16\n",
    "\n",
    "FEW_SHOTS = 5\n",
    "\n",
    "# See https://docs.litellm.ai/docs/providers/vllm for details\n",
    "TASK_MODEL = dspy.LM(\n",
    "    \"hosted_vllm/meta-llama/Llama-3.3-70B-Instruct\",\n",
    "    api_base = 'http://localhost:8000/v1' , # or api_base ?\n",
    "    # api_version: Optional[str] = None,\n",
    "    # api_key: Optional[str] = None,\n",
    "    # seed: Optional[int] = None,\n",
    "    # max_tokens: Optional[int] = None,\n",
    "    # timeout: Optional[Union[float, int]] = None,\n",
    ")\n",
    "PROMPT_MODEL = dspy.LM(\n",
    "    \"hosted_vllm/meta-llama/Llama-3.3-70B-Instruct\",\n",
    "    api_base = 'http://localhost:8000/v1', # or api_base ?\n",
    "    # api_version: Optional[str] = None,\n",
    "    # api_key: Optional[str] = None,\n",
    "    # seed: Optional[int] = None,\n",
    "    # max_tokens: Optional[int] = None,\n",
    "    # timeout: Optional[Union[float, int]] = None,\n",
    ")\n",
    "\n",
    "dspy.configure(lm=TASK_MODEL)\n",
    "\n",
    "# replace this with llama_mmlu_pro or whatever\n",
    "benchmark = llama_mmlu_pro\n",
    "\n",
    "# Without chain of thought:\n",
    "# program = dspy.Predict(\n",
    "#     benchmark.signature(\"\")\n",
    "# )\n",
    "\n",
    "# With chain of thought:\n",
    "program = dspy.ChainOfThought(\n",
    "    benchmark.signature(\"You are a helpful assistant designed to help with multiple choice question. Always return a JSON object with the following format:\\n\"\n",
    "        \"{\\n\"\n",
    "        '  \"reasoning\": \"Step-by-step reasoning here.\",\\n'\n",
    "        '  \"answer\": \"Final answer (A, B, C, etc.)\"\\n'\n",
    "        \"}\\n\"\n",
    "        \"Do NOT return plain text. Only return a valid JSON object with these keys.\") # put your initial system prompt here, or leave blank\n",
    ")\n",
    "\n",
    "evaluate = dspy.Evaluate(\n",
    "    devset=[],\n",
    "    metric=benchmark.metric,\n",
    "    num_threads=NUM_THREADS,\n",
    "    display_progress=True,\n",
    "    display_table=True,\n",
    "    return_all_scores=True,\n",
    "    return_outputs=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1197, 2156, 8626)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainset, valset, testset = benchmark.datasets(\n",
    "    train_size=0.1,\n",
    "    validation_size=0.2,\n",
    ")\n",
    "\n",
    "len(trainset), len(valset), len(testset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Baseline Benchmark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BASE PROMPT:\n",
      " Multiple choice question answering with reasoning.\n",
      "CPU times: user 170 μs, sys: 21 μs, total: 191 μs\n",
      "Wall time: 171 μs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"BASE PROMPT:\\n\", program.predict.signature.instructions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting execution...\n",
      "Average Metric: 0.00 / 101 (0.0%):   1%|▉                                                                                   | 101/8626 [00:57<1:31:30,  1.55it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/22 00:51:43 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A compressible gas flows over a flat plate. Properties of the gasare closely similar to those of air. The flow is at a temperatureand pressure of 700°F and 30psia, respectively. The plate is 1 in. in length and is assumed to beperfectly insulated. If the gas is moving at a speed of 500 ft/sec, calculate the surface temperature of the plate. (Note: the speed is too great to neglect the effects of viscous dissipation.)', 'options': {'A': '700.5°F', 'B': '700°F', 'C': '780°F', 'D': '772°F', 'E': '735°F', 'F': '800°F', 'G': '750°F', 'H': '685°F', 'I': '716.25°F', 'J': '810°F'}, 'reasoning': '', 'answer': 'I'}) (input_keys={'question', 'options'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 0.00 / 116 (0.0%):   1%|█▏                                                                                  | 116/8626 [01:08<1:13:52,  1.92it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/22 00:51:53 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A mass $m$ moves in one dimension and is subject to a constant force $+F_0$ when $x<0$ and to a constant force $-F_0$ when $x>0$. Describe the motion by constructing a phase diagram. Calculate the period of the motion in terms of $m, F_0$, and the amplitude $A$ (disregard damping) .', 'options': {'A': '2 $\\\\sqrt{\\\\frac{m A}{F_0}}$', 'B': '6 $\\\\sqrt{\\\\frac{2 m A}{F_0}}$', 'C': '4 $\\\\sqrt{\\\\frac{m A}{F_0}}$', 'D': '2 $\\\\sqrt{\\\\frac{2 m A}{F_0}}$', 'E': '$\\\\pi \\\\sqrt{\\\\frac{2 m A}{F_0}}$', 'F': '$\\\\sqrt{\\\\frac{8 m A}{F_0}}$', 'G': '4 $\\\\sqrt{\\\\frac{m A}{2 F_0}}$', 'H': '$\\\\sqrt{\\\\frac{m A}{2 F_0}}$', 'I': ' 4 $\\\\sqrt{\\\\frac{2 m A}{F_0}}$', 'J': '$\\\\sqrt{\\\\frac{4 m A}{F_0}}$'}, 'reasoning': '', 'answer': 'D'}) (input_keys={'question', 'options'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 0.00 / 121 (0.0%):   1%|█▏                                                                                  | 123/8626 [01:11<1:49:26,  1.29it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/22 00:51:57 ERROR dspy.utils.parallelizer: Error processing item Example({'question': '(a) 10 annual mortgage payments of $1,000, (b) 12 monthly payments of $100 on his automobile, (c) a bill for $2,000 due in two years, (d) a bill for $1,000 due today. Using an annual interest rate of 12 percent (nominal rate on the automobile loan and effective rate on all other debts), determine the annual amount necessary to retire the entire debt in 15 years.', 'options': {'A': '$1,225.00', 'B': '$1,500.00', 'C': '$1,580.42', 'D': '$1,250.00', 'E': '$1,450.00', 'F': '$1,375.69', 'G': '$1,600.00', 'H': '$1,700.00', 'I': '$1,305.75', 'J': '$1,520.34'}, 'reasoning': '', 'answer': 'D'}) (input_keys={'question', 'options'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 0.00 / 143 (0.0%):   2%|█▍                                                                                  | 146/8626 [01:24<1:19:06,  1.79it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/22 00:52:09 WARNING dspy.utils.parallelizer: Received SIGINT. Cancelling execution.\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "print(\"Starting execution...\")\n",
    "eval_subset_size = len(testset)\n",
    "evaluate(\n",
    "    program,\n",
    "    devset=testset[:eval_subset_size],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optimize Subset + Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 15:46:14 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:\n",
      "num_trials: 7\n",
      "minibatch: False\n",
      "num_candidates: 5\n",
      "valset size: 20\n",
      "\n",
      "2025/01/21 15:46:14 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n",
      "2025/01/21 15:46:14 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.\n",
      "\n",
      "2025/01/21 15:46:14 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=5 sets of demonstrations...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapping set 1/5\n",
      "Bootstrapping set 2/5\n",
      "Bootstrapping set 3/5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 25%|███████████████████████████████▎                                                                                             | 5/20 [00:19<00:59,  3.98s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.\n",
      "Bootstrapping set 4/5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 20%|█████████████████████████                                                                                                    | 4/20 [00:34<02:17,  8.61s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n",
      "Bootstrapping set 5/5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 30%|█████████████████████████████████████▌                                                                                       | 6/20 [00:39<01:31,  6.52s/it]\n",
      "2025/01/21 15:47:48 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==\n",
      "2025/01/21 15:47:48 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 15:48:03 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "Proposing instructions...\n",
      "\n",
      "2025/01/21 15:48:30 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n",
      "\n",
      "2025/01/21 15:48:30 INFO dspy.teleprompt.mipro_optimizer_v2: 0: You are a helpful assistant designed to help with multiple choice question.\n",
      "\n",
      "2025/01/21 15:48:30 INFO dspy.teleprompt.mipro_optimizer_v2: 1: You are a helpful assistant designed to help with multiple choice questions from various domains, including physics, mathematics, biology, economics, law, and social sciences. Given a question and a set of options, please provide a step-by-step reasoning process to arrive at the correct answer, and then select the correct answer from the options provided. Your response should include a detailed explanation of your thought process, ensuring that each step is clearly described and logically connected to the next, ultimately leading to the selection of the correct answer.\n",
      "\n",
      "2025/01/21 15:48:30 INFO dspy.teleprompt.mipro_optimizer_v2: 2: To answer multiple-choice questions accurately, I will carefully analyze the question and options, generate a step-by-step reasoning process based on the context and available information, and then select the correct answer from the options provided. I will utilize my knowledge of various subjects, including physics, mathematics, biology, economics, law, and social sciences, to provide well-reasoned and informative responses. My goal is to provide clear and concise reasoning for each question, making it easier to understand the thought process behind the answer. I will consider the different theories, concepts, and formulas relevant to each subject area to ensure that my responses are accurate and helpful.\n",
      "\n",
      "2025/01/21 15:48:30 INFO dspy.teleprompt.mipro_optimizer_v2: 3: You are a knowledgeable assistant skilled in solving multiple-choice questions across various domains, including physics, mathematics, biology, economics, law, and social sciences. Your task is to provide step-by-step reasoning for a given question and set of options, ultimately selecting the correct answer. To accomplish this, carefully read the question, analyze the provided options, and apply relevant formulas, principles, and concepts from the respective field to deduce the correct answer. Generate a detailed explanation of your thought process, making your reasoning transparent and easy to follow. Ensure your response includes the correct answer choice (A, B, C, D, etc.) and a clear, step-by-step justification for your selection.\n",
      "\n",
      "2025/01/21 15:48:30 INFO dspy.teleprompt.mipro_optimizer_v2: 4: You are a highly skilled expert witness in a high-stakes courtroom trial, and your task is to analyze complex multiple-choice questions and provide step-by-step reasoning to support your answer. The questions will cover a wide range of subjects, including physics, mathematics, biology, economics, law, and social sciences. Your goal is to think critically and arrive at the correct answer, while also providing a clear and coherent explanation of your thought process. The jury is counting on your expertise to make an informed decision, and the outcome of the trial hangs in the balance. Please respond with the correct answer and your reasoning.\n",
      "\n",
      "2025/01/21 15:48:30 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "\n",
      "2025/01/21 15:48:30 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the default program...\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 15.00 / 20 (75.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:21<00:00,  1.09s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 15:48:52 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)\n",
      "2025/01/21 15:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 75.0\n",
      "\n",
      "2025/01/21 15:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n",
      "2025/01/21 15:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n",
      "\n",
      "/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/optuna/_experimental.py:31: ExperimentalWarning: Argument ``multivariate`` is an experimental feature. The interface can change in the future.\n",
      "  warnings.warn(\n",
      "2025/01/21 15:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 1 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 14.00 / 20 (70.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:15<00:00,  1.30it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 15:49:08 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)\n",
      "2025/01/21 15:49:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/21 15:49:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0]\n",
      "2025/01/21 15:49:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0\n",
      "2025/01/21 15:49:08 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/21 15:49:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 15.00 / 20 (75.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:15<00:00,  1.31it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 15:49:23 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)\n",
      "2025/01/21 15:49:23 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/21 15:49:23 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 75.0]\n",
      "2025/01/21 15:49:23 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0\n",
      "2025/01/21 15:49:23 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/21 15:49:23 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 15.00 / 20 (75.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:13<00:00,  1.45it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 15:49:37 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)\n",
      "2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 75.0, 75.0]\n",
      "2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0\n",
      "2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 15.00 / 20 (75.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 3087.23it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 15:49:37 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)\n",
      "2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 75.0, 75.0, 75.0]\n",
      "2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0\n",
      "2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 16.00 / 20 (80.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:15<00:00,  1.31it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 15:49:53 INFO dspy.evaluate.evaluate: Average Metric: 16 / 20 (80.0%)\n",
      "2025/01/21 15:49:53 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mBest full score so far!\u001b[0m Score: 80.0\n",
      "2025/01/21 15:49:53 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3'].\n",
      "2025/01/21 15:49:53 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 75.0, 75.0, 75.0, 80.0]\n",
      "2025/01/21 15:49:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/21 15:49:53 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/21 15:49:53 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 14.00 / 20 (70.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:15<00:00,  1.30it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 15:50:08 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)\n",
      "2025/01/21 15:50:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/21 15:50:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 75.0, 75.0, 75.0, 80.0, 70.0]\n",
      "2025/01/21 15:50:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/21 15:50:08 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/21 15:50:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 14.00 / 20 (70.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:13<00:00,  1.45it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 15:50:22 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)\n",
      "2025/01/21 15:50:22 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 4'].\n",
      "2025/01/21 15:50:22 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 75.0, 75.0, 75.0, 80.0, 70.0, 70.0]\n",
      "2025/01/21 15:50:22 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/21 15:50:22 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/21 15:50:22 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 80.0!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "subset_size = 20\n",
    "optimizer = dspy.MIPROv2(\n",
    "    metric=benchmark.metric,\n",
    "    auto=\"light\",\n",
    "    num_threads=NUM_THREADS,\n",
    "    task_model=TASK_MODEL,\n",
    "    prompt_model=PROMPT_MODEL,\n",
    "    max_labeled_demos=FEW_SHOTS,\n",
    ")\n",
    "\n",
    "optimized_program = optimizer.compile(\n",
    "    program,\n",
    "    trainset=trainset[:subset_size],\n",
    "    valset=valset[:subset_size],\n",
    "    requires_permission_to_run=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BEST PROMPT:\n",
      " You are a highly skilled expert witness in a high-stakes courtroom trial, and your task is to analyze complex multiple-choice questions and provide step-by-step reasoning to support your answer. The questions will cover a wide range of subjects, including physics, mathematics, biology, economics, law, and social sciences. Your goal is to think critically and arrive at the correct answer, while also providing a clear and coherent explanation of your thought process. The jury is counting on your expertise to make an informed decision, and the outcome of the trial hangs in the balance. Please respond with the correct answer and your reasoning.\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"BEST PROMPT:\\n\", optimized_program.predict.signature.instructions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 417.00 / 570 (73.2%):   7%|█████▏                                                                        | 574/8626 [6:54:48<96:58:50, 43.36s/it]\n",
      "Average Metric: 218.00 / 286 (76.2%):   3%|██▊                                                                                | 286/8626 [02:32<59:39,  2.33it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 23:37:28 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Consider a thoroughly mixed vessel where a salt is dis-solved inwater. The volume of the fresh water initially in the tank is 100lbm.The inlet conditions are,ṁ_w= 150 lb/hr. and ṁ_s= 30 lb/hr. The resulting solution leaves at a rate of 120 lb/hr. If the flow in and out remain constant, compute the outletconcentration after one hour.', 'options': {'A': '0.86', 'B': '0.76', 'C': '0.46', 'D': '0.16', 'E': '0.06', 'F': '0.26', 'G': '0.96', 'H': '0.56', 'I': '0.36', 'J': '0.66'}, 'answer': ''}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 452.00 / 604 (74.8%):   7%|█████▋                                                                           | 604/8626 [05:18<1:17:58,  1.71it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 23:40:13 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Two identical conducting spheres, fixed in place, attract each other with an electrostatic force of $0.108 \\\\mathrm{~N}$ when their center-to-center separation is $50.0 \\\\mathrm{~cm}$. The spheres are then connected by a thin conducting wire. When the wire is removed, the spheres repel each other with an electrostatic force of $0.0360 \\\\mathrm{~N}$. Of the initial charges on the spheres, with a positive net charge, what was (a) the negative charge on one of them?', 'options': {'A': '$-2.00 \\\\mu \\\\mathrm{C}$', 'B': '$-2.50 \\\\mu \\\\mathrm{C}$', 'C': '$-0.50 \\\\mu \\\\mathrm{C}$', 'D': '$-1.75 \\\\mu \\\\mathrm{C}$', 'E': '$-0.75 \\\\mu \\\\mathrm{C}$', 'F': ' $-1.00 \\\\mu \\\\mathrm{C}$$ \\\\mu \\\\mathrm{C}$', 'G': '$-0.25 \\\\mu \\\\mathrm{C}$', 'H': '$-1.50 \\\\mu \\\\mathrm{C}$', 'I': '$-3.00 \\\\mu \\\\mathrm{C}$', 'J': '$-1.25 \\\\mu \\\\mathrm{C}$'}, 'answer': ''}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 1075.00 / 1439 (74.7%):  17%|█████████████▎                                                                  | 1441/8626 [12:30<51:52,  2.31it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 23:47:24 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Assume that all gases are perfect and that data refer to 298 K unless otherwise stated. Find an expression for the fugacity coefficient of a gas that obeys the equation of state $p V_{\\\\mathrm{m}}=R T\\\\left(1+B / V_{\\\\mathrm{m}}+C / V_{\\\\mathrm{m}}^2\\\\right)$. Use the resulting expression to estimate the fugacity of argon at 1.00 atm and $100 \\\\mathrm{~K}$ using $B=-21.13 \\\\mathrm{~cm}^3 \\\\mathrm{~mol}^{-1}$ and $C=1054 \\\\mathrm{~cm}^6 \\\\mathrm{~mol}^{-2}$.', 'options': {'A': '1.0236$\\\\text{atm}$', 'B': '1.0567 atm', 'C': '0.9852 atm', 'D': ' 0.9974$\\\\text{atm}$ ', 'E': '0.9321 atm', 'F': '1.0000 atm', 'G': '1.0150 atm', 'H': '0.9125$\\\\text{atm}$', 'I': '1.1024$\\\\text{atm}$', 'J': '0.9500 atm'}, 'answer': ''}) (input_keys={'options', 'question'}): 'list' object has no attribute 'items'. Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 1319.00 / 1758 (75.0%):  20%|████████████████▎                                                               | 1761/8626 [15:12<57:12,  2.00it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/21 23:50:08 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'The ${ }^7 \\\\mathrm{Li}^1 \\\\mathrm{H}$ ground electronic state has $D_0=2.4287 \\\\mathrm{eV}, \\\\nu_e / c=1405.65 \\\\mathrm{~cm}^{-1}$, and $\\\\nu_e x_e / c=23.20 \\\\mathrm{~cm}^{-1}$, where $c$ is the speed of light. (These last two quantities are usually designated $\\\\omega_e$ and $\\\\omega_e x_e$ in the literature.) Calculate $D_e$ for ${ }^7 \\\\mathrm{Li}^1 \\\\mathrm{H}$.', 'options': {'A': '2.4000 eV', 'B': '2.6000 $\\\\mathrm{eV}$', 'C': '2.5500 eV', 'D': '2.3000 $\\\\mathrm{eV}$', 'E': '2.4287 $\\\\mathrm{eV}$', 'F': '2.3500 eV', 'G': ' 2.5151 $\\\\mathrm{eV}$', 'H': '2.4500 eV', 'I': '2.4850 eV', 'J': '2.5350 eV'}, 'answer': 'G'}) (input_keys={'options', 'question'}): 'list' object has no attribute 'items'. Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 1353.00 / 1805 (75.0%):  21%|████████████████▊                                                               | 1809/8626 [15:38<54:42,  2.08it/s]"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning'])",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/base.py:30\u001b[0m, in \u001b[0;36mAdapter.__call__\u001b[0;34m(self, lm, lm_kwargs, signature, demos, inputs)\u001b[0m\n\u001b[1;32m     28\u001b[0m     output, output_logprobs \u001b[38;5;241m=\u001b[39m output[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m], output[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlogprobs\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m---> 30\u001b[0m value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43msignature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     32\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mset\u001b[39m(value\u001b[38;5;241m.\u001b[39mkeys()) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mset\u001b[39m(signature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys()), \\\n\u001b[1;32m     33\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpected \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msignature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/callback.py:234\u001b[0m, in \u001b[0;36mwith_callbacks.<locals>.wrapper\u001b[0;34m(instance, *args, **kwargs)\u001b[0m\n\u001b[1;32m    233\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m callbacks:\n\u001b[0;32m--> 234\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    236\u001b[0m \u001b[38;5;66;03m# Generate call ID as the unique identifier for the call, this is useful for instrumentation.\u001b[39;00m\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/chat_adapter.py:84\u001b[0m, in \u001b[0;36mChatAdapter.parse\u001b[0;34m(self, signature, completion)\u001b[0m\n\u001b[1;32m     83\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m fields\u001b[38;5;241m.\u001b[39mkeys() \u001b[38;5;241m!=\u001b[39m signature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[0;32m---> 84\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpected \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msignature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfields\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     86\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fields\n",
      "\u001b[0;31mValueError\u001b[0m: Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning'])",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[11], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m eval_subset_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(testset)\n\u001b[0;32m----> 2\u001b[0m score, results, all_scores \u001b[38;5;241m=\u001b[39m \u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      3\u001b[0m \u001b[43m    \u001b[49m\u001b[43moptimized_program\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m      4\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdevset\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtestset\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43meval_subset_size\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m      5\u001b[0m \u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/evaluate/evaluate.py:112\u001b[0m, in \u001b[0;36mEvaluate.__call__\u001b[0;34m(self, program, metric, devset, num_threads, display_progress, display_table, return_all_scores, return_outputs)\u001b[0m\n\u001b[1;32m    108\u001b[0m         program\u001b[38;5;241m.\u001b[39m_suggest_failures \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m dspy\u001b[38;5;241m.\u001b[39msettings\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msuggest_failures\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    110\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m prediction, score\n\u001b[0;32m--> 112\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mexecutor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocess_item\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevset\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    113\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(devset) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(results)\n\u001b[1;32m    115\u001b[0m results \u001b[38;5;241m=\u001b[39m [((dspy\u001b[38;5;241m.\u001b[39mPrediction(), \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfailure_score) \u001b[38;5;28;01mif\u001b[39;00m r \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m r) \u001b[38;5;28;01mfor\u001b[39;00m r \u001b[38;5;129;01min\u001b[39;00m results]\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/parallelizer.py:39\u001b[0m, in \u001b[0;36mParallelExecutor.execute\u001b[0;34m(self, function, data)\u001b[0m\n\u001b[1;32m     37\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_execute_isolated_single_thread(wrapped_function, data)\n\u001b[1;32m     38\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 39\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_multi_thread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mwrapped_function\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/parallelizer.py:180\u001b[0m, in \u001b[0;36mParallelExecutor._execute_multi_thread\u001b[0;34m(self, function, data)\u001b[0m\n\u001b[1;32m    172\u001b[0m pbar \u001b[38;5;241m=\u001b[39m tqdm\u001b[38;5;241m.\u001b[39mtqdm(\n\u001b[1;32m    173\u001b[0m     total\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(data),\n\u001b[1;32m    174\u001b[0m     dynamic_ncols\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m    175\u001b[0m     disable\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdisable_progress_bar,\n\u001b[1;32m    176\u001b[0m     file\u001b[38;5;241m=\u001b[39msys\u001b[38;5;241m.\u001b[39mstdout\n\u001b[1;32m    177\u001b[0m )\n\u001b[1;32m    179\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m future \u001b[38;5;129;01min\u001b[39;00m as_completed(futures):\n\u001b[0;32m--> 180\u001b[0m     index, result \u001b[38;5;241m=\u001b[39m \u001b[43mfuture\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    182\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m job_cancelled:\n\u001b[1;32m    183\u001b[0m         \u001b[38;5;28;01mcontinue\u001b[39;00m\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/concurrent/futures/_base.py:451\u001b[0m, in \u001b[0;36mFuture.result\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    449\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m CancelledError()\n\u001b[1;32m    450\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;241m==\u001b[39m FINISHED:\n\u001b[0;32m--> 451\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__get_result\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    453\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_condition\u001b[38;5;241m.\u001b[39mwait(timeout)\n\u001b[1;32m    455\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_state \u001b[38;5;129;01min\u001b[39;00m [CANCELLED, CANCELLED_AND_NOTIFIED]:\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/concurrent/futures/_base.py:403\u001b[0m, in \u001b[0;36mFuture.__get_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    401\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception:\n\u001b[1;32m    402\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 403\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_exception\n\u001b[1;32m    404\u001b[0m     \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m    405\u001b[0m         \u001b[38;5;66;03m# Break a reference cycle with the exception in self._exception\u001b[39;00m\n\u001b[1;32m    406\u001b[0m         \u001b[38;5;28mself\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/concurrent/futures/thread.py:58\u001b[0m, in \u001b[0;36m_WorkItem.run\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     55\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m     57\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 58\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     59\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m     60\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfuture\u001b[38;5;241m.\u001b[39mset_exception(exc)\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/parallelizer.py:158\u001b[0m, in \u001b[0;36mParallelExecutor._execute_multi_thread.<locals>.cancellable_function\u001b[0;34m(parent_overrides, index_item)\u001b[0m\n\u001b[1;32m    155\u001b[0m thread_local_overrides\u001b[38;5;241m.\u001b[39moverrides \u001b[38;5;241m=\u001b[39m parent_overrides\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[1;32m    157\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 158\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m index, \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    159\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m    160\u001b[0m     thread_local_overrides\u001b[38;5;241m.\u001b[39moverrides \u001b[38;5;241m=\u001b[39m original_overrides\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/parallelizer.py:54\u001b[0m, in \u001b[0;36mParallelExecutor._wrap_function.<locals>.wrapped\u001b[0;34m(item)\u001b[0m\n\u001b[1;32m     52\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m current_error_count \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmax_errors:\n\u001b[1;32m     53\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcancel_jobs\u001b[38;5;241m.\u001b[39mset()\n\u001b[0;32m---> 54\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m     55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprovide_traceback:\n\u001b[1;32m     56\u001b[0m     logger\u001b[38;5;241m.\u001b[39merror(\n\u001b[1;32m     57\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError processing item \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mitem\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mStack trace:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mtraceback\u001b[38;5;241m.\u001b[39mformat_exc()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     58\u001b[0m     )\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/parallelizer.py:47\u001b[0m, in \u001b[0;36mParallelExecutor._wrap_function.<locals>.wrapped\u001b[0;34m(item)\u001b[0m\n\u001b[1;32m     45\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m     46\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 47\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     48\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m     49\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39merror_lock:\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/evaluate/evaluate.py:101\u001b[0m, in \u001b[0;36mEvaluate.__call__.<locals>.process_item\u001b[0;34m(example)\u001b[0m\n\u001b[1;32m    100\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprocess_item\u001b[39m(example):\n\u001b[0;32m--> 101\u001b[0m     prediction \u001b[38;5;241m=\u001b[39m \u001b[43mprogram\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mexample\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    102\u001b[0m     score \u001b[38;5;241m=\u001b[39m metric(example, prediction)\n\u001b[1;32m    104\u001b[0m     \u001b[38;5;66;03m# Increment assert and suggest failures to program's attributes\u001b[39;00m\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/callback.py:234\u001b[0m, in \u001b[0;36mwith_callbacks.<locals>.wrapper\u001b[0;34m(instance, *args, **kwargs)\u001b[0m\n\u001b[1;32m    232\u001b[0m \u001b[38;5;66;03m# If no callbacks are provided, just call the function\u001b[39;00m\n\u001b[1;32m    233\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m callbacks:\n\u001b[0;32m--> 234\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    236\u001b[0m \u001b[38;5;66;03m# Generate call ID as the unique identifier for the call, this is useful for instrumentation.\u001b[39;00m\n\u001b[1;32m    237\u001b[0m call_id \u001b[38;5;241m=\u001b[39m uuid\u001b[38;5;241m.\u001b[39muuid4()\u001b[38;5;241m.\u001b[39mhex\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/primitives/program.py:22\u001b[0m, in \u001b[0;36mModule.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m     20\u001b[0m \u001b[38;5;129m@with_callbacks\u001b[39m\n\u001b[1;32m     21\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 22\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/predict/chain_of_thought.py:20\u001b[0m, in \u001b[0;36mChainOfThought.forward\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m     19\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 20\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/callback.py:234\u001b[0m, in \u001b[0;36mwith_callbacks.<locals>.wrapper\u001b[0;34m(instance, *args, **kwargs)\u001b[0m\n\u001b[1;32m    232\u001b[0m \u001b[38;5;66;03m# If no callbacks are provided, just call the function\u001b[39;00m\n\u001b[1;32m    233\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m callbacks:\n\u001b[0;32m--> 234\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    236\u001b[0m \u001b[38;5;66;03m# Generate call ID as the unique identifier for the call, this is useful for instrumentation.\u001b[39;00m\n\u001b[1;32m    237\u001b[0m call_id \u001b[38;5;241m=\u001b[39m uuid\u001b[38;5;241m.\u001b[39muuid4()\u001b[38;5;241m.\u001b[39mhex\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/predict/predict.py:81\u001b[0m, in \u001b[0;36mPredict.__call__\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m     79\u001b[0m \u001b[38;5;129m@with_callbacks\u001b[39m\n\u001b[1;32m     80\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m---> 81\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/predict/predict.py:111\u001b[0m, in \u001b[0;36mPredict.forward\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m    109\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdspy\u001b[39;00m\n\u001b[1;32m    110\u001b[0m adapter \u001b[38;5;241m=\u001b[39m dspy\u001b[38;5;241m.\u001b[39msettings\u001b[38;5;241m.\u001b[39madapter \u001b[38;5;129;01mor\u001b[39;00m dspy\u001b[38;5;241m.\u001b[39mChatAdapter()\n\u001b[0;32m--> 111\u001b[0m completions \u001b[38;5;241m=\u001b[39m \u001b[43madapter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlm_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msignature\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msignature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdemos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdemos\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    113\u001b[0m pred \u001b[38;5;241m=\u001b[39m Prediction\u001b[38;5;241m.\u001b[39mfrom_completions(completions, signature\u001b[38;5;241m=\u001b[39msignature)\n\u001b[1;32m    115\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_trace\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m) \u001b[38;5;129;01mand\u001b[39;00m dspy\u001b[38;5;241m.\u001b[39msettings\u001b[38;5;241m.\u001b[39mtrace \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/base.py:45\u001b[0m, in \u001b[0;36mAdapter.__call__\u001b[0;34m(self, lm, lm_kwargs, signature, demos, inputs)\u001b[0m\n\u001b[1;32m     43\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mjson_adapter\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m JSONAdapter\n\u001b[1;32m     44\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m, JSONAdapter):\n\u001b[0;32m---> 45\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mJSONAdapter\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlm_kwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msignature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdemos\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     46\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/json_adapter.py:60\u001b[0m, in \u001b[0;36mJSONAdapter.__call__\u001b[0;34m(self, lm, lm_kwargs, signature, demos, inputs)\u001b[0m\n\u001b[1;32m     57\u001b[0m values \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m     59\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m output \u001b[38;5;129;01min\u001b[39;00m outputs:\n\u001b[0;32m---> 60\u001b[0m     value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43msignature\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     61\u001b[0m     \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mset\u001b[39m(value\u001b[38;5;241m.\u001b[39mkeys()) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mset\u001b[39m(\n\u001b[1;32m     62\u001b[0m         signature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys()\n\u001b[1;32m     63\u001b[0m     ), \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpected \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msignature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mvalue\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     64\u001b[0m     values\u001b[38;5;241m.\u001b[39mappend(value)\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/callback.py:234\u001b[0m, in \u001b[0;36mwith_callbacks.<locals>.wrapper\u001b[0;34m(instance, *args, **kwargs)\u001b[0m\n\u001b[1;32m    232\u001b[0m \u001b[38;5;66;03m# If no callbacks are provided, just call the function\u001b[39;00m\n\u001b[1;32m    233\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m callbacks:\n\u001b[0;32m--> 234\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43minstance\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    236\u001b[0m \u001b[38;5;66;03m# Generate call ID as the unique identifier for the call, this is useful for instrumentation.\u001b[39;00m\n\u001b[1;32m    237\u001b[0m call_id \u001b[38;5;241m=\u001b[39m uuid\u001b[38;5;241m.\u001b[39muuid4()\u001b[38;5;241m.\u001b[39mhex\n",
      "File \u001b[0;32m~/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/json_adapter.py:102\u001b[0m, in \u001b[0;36mJSONAdapter.parse\u001b[0;34m(self, signature, completion)\u001b[0m\n\u001b[1;32m     99\u001b[0m         fields[k] \u001b[38;5;241m=\u001b[39m parse_value(v, signature\u001b[38;5;241m.\u001b[39moutput_fields[k]\u001b[38;5;241m.\u001b[39mannotation)\n\u001b[1;32m    101\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m fields\u001b[38;5;241m.\u001b[39mkeys() \u001b[38;5;241m!=\u001b[39m signature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[0;32m--> 102\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpected \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msignature\u001b[38;5;241m.\u001b[39moutput_fields\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfields\u001b[38;5;241m.\u001b[39mkeys()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fields\n",
      "\u001b[0;31mValueError\u001b[0m: Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning'])"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "eval_subset_size = len(testset)\n",
    "score, results, all_scores = evaluate(\n",
    "    optimized_program,\n",
    "    devset=testset[:eval_subset_size],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Medium Optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "subset_size = 500\n",
    "optimizer = dspy.MIPROv2(\n",
    "    metric=benchmark.metric,\n",
    "    auto=\"medium\",\n",
    "    num_threads=NUM_THREADS,\n",
    "    task_model=TASK_MODEL,\n",
    "    prompt_model=PROMPT_MODEL,\n",
    "    max_labeled_demos=FEW_SHOTS,\n",
    ")\n",
    "\n",
    "optimized_program = optimizer.compile(\n",
    "    program,\n",
    "    trainset=trainset[:subset_size],\n",
    "    valset=valset[:subset_size],\n",
    "    requires_permission_to_run=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"BEST PROMPT:\\n\", optimized_program.predict.signature.instructions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"BEST PROMPT:\\n\", optimized_program.predict.signature.instructions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"BEST EXAMPLES:\\n\", optimized_program.predict.demos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "score, results, all_scores = evaluate(\n",
    "    optimized_program,\n",
    "    devset=testset[:subset_size],\n",
    "    display_table=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "eval_medium_subset_size = 300\n",
    "score, results, all_scores = evaluate(\n",
    "    optimized_program,\n",
    "    devset=testset[:eval_medium_subset_size],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Heavy Optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "optimizer = dspy.MIPROv2(\n",
    "    metric=benchmark.metric,\n",
    "    auto=\"heavy\",\n",
    "    num_threads=NUM_THREADS,\n",
    "    task_model=TASK_MODEL,\n",
    "    prompt_model=PROMPT_MODEL,\n",
    "    max_labeled_demos=FEW_SHOTS,\n",
    ")\n",
    "\n",
    "optimized_program = optimizer.compile(\n",
    "    program,\n",
    "    trainset=trainset,\n",
    "    valset=valset,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"BEST PROMPT:\\n\", optimized_program.predict.signature.instructions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "score, results, all_scores = evaluate(\n",
    "    optimized_program,\n",
    "    devset=testset,\n",
    "    display_table=False,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}