{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Imports & Env Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "import os\n",
    "import sys\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "# Load .env before the remaining imports; the original ordering is kept in\n",
    "# case any of them read environment variables at import time.\n",
    "load_dotenv()\n",
    "\n",
    "from datasets import load_dataset\n",
    "import dspy\n",
    "\n",
    "# Put the parent directory on sys.path before the `benchmarks` import below\n",
    "# (presumably that is where `benchmarks` lives; verify against the repo layout).\n",
    "sys.path.append(os.path.abspath('../'))\n",
    "from benchmarks import llama_mmlu_pro, leaderboard_mmlu_pro, llama_mmlu"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of worker threads passed to dspy.Evaluate.\n",
    "NUM_THREADS = 48\n",
    "\n",
    "# Not referenced in this cell; presumably consumed by later cells.\n",
    "FEW_SHOTS = 5\n",
    "\n",
    "# Value passed to dspy.Evaluate as max_errors.\n",
    "MAX_ERRORS = 500\n",
    "\n",
    "# Both LMs use the same model string, routed through LiteLLM's hosted_vllm provider.\n",
    "# See https://docs.litellm.ai/docs/providers/vllm for details\n",
    "MODEL_NAME = \"hosted_vllm/meta-llama/Llama-3.3-70B-Instruct\"\n",
    "\n",
    "# Connection settings shared by TASK_MODEL and PROMPT_MODEL so the two cannot drift apart.\n",
    "# Further dspy.LM keyword arguments can be added here (the stub this replaces\n",
    "# listed api_version, seed, max_tokens and timeout as optional).\n",
    "LM_KWARGS = dict(\n",
    "    api_base=\"http://localhost:8000/v1\",\n",
    "    api_key=\"dummy\",  # placeholder value; presumably the local server does not check it\n",
    ")\n",
    "\n",
    "TASK_MODEL = dspy.LM(MODEL_NAME, **LM_KWARGS)\n",
    "PROMPT_MODEL = dspy.LM(MODEL_NAME, **LM_KWARGS)\n",
    "\n",
    "dspy.configure(lm=TASK_MODEL)\n",
    "\n",
    "# Benchmark module to evaluate; replace with llama_mmlu_pro or leaderboard_mmlu_pro as needed.\n",
    "benchmark = llama_mmlu\n",
    "\n",
    "# Initial system prompt handed to the benchmark signature; use \"\" to leave it blank.\n",
    "SYSTEM_PROMPT = (\n",
    "    \"You are a knowledgeable and insightful assistant designed to provide guidance on multiple-choice questions. \"\n",
    "    \"Your role involves analyzing questions, assessing answer choices, and offering well-reasoned explanations to enhance understanding. \"\n",
    "    \"By breaking down complex concepts, you help users develop critical thinking skills and improve their decision-making process. \"\n",
    "    \"You strive to present information in a clear, structured manner while adapting to the user's level of expertise. \"\n",
    "    \"Ultimately, your goal is to foster deeper comprehension and confidence in tackling multiple-choice assessments.\"\n",
    ")\n",
    "\n",
    "# Without chain of thought:\n",
    "# program = dspy.Predict(benchmark.signature(\"\"))\n",
    "\n",
    "# With chain of thought:\n",
    "program = dspy.ChainOfThought(benchmark.signature(SYSTEM_PROMPT))\n",
    "\n",
    "# devset is left empty here; presumably the split to score is supplied when\n",
    "# `evaluate` is called (TODO confirm against the cells that invoke it).\n",
    "evaluate = dspy.Evaluate(\n",
    "    devset=[],\n",
    "    max_errors=MAX_ERRORS,\n",
    "    metric=benchmark.metric,\n",
    "    num_threads=NUM_THREADS,\n",
    "    display_progress=True,\n",
    "    display_table=True,\n",
    "    return_all_scores=True,\n",
    "    return_outputs=True,\n",
    "    provide_traceback=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1403, 1263, 11369)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Request the benchmark's three splits, passing 0.1 for both size arguments.\n",
    "trainset, valset, testset = benchmark.datasets(train_size=0.1, validation_size=0.1)\n",
    "\n",
    "# Cell output: the length of each split, in (train, validation, test) order.\n",
    "tuple(len(split) for split in (trainset, valset, testset))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Baseline Benchmark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BASE PROMPT:\n",
      " \n",
      "CPU times: user 348 μs, sys: 0 ns, total: 348 μs\n",
      "Wall time: 283 μs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Show the instruction text currently attached to the program's signature.\n",
    "print(f\"BASE PROMPT:\\n {program.signature.instructions}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BEST EXAMPLES:\n",
      " []\n",
      "CPU times: user 300 μs, sys: 16 μs, total: 316 μs\n",
      "Wall time: 248 μs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Show the demos currently attached to the program.\n",
    "print(f\"BEST EXAMPLES:\\n {program.demos}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting execution...\n",
      "  0%|                                                                                                                                                                                                                                            | 0/25 [11:06<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 00:13:12 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Find the value of the integral $\\\\int_S(\\\\nabla \\\\times \\\\mathbf{A}) \\\\cdot d \\\\mathbf{a}$ if the vector $\\\\mathbf{A}=y \\\\mathbf{i}+z \\\\mathbf{j}+x \\\\mathbf{k}$ and $S$ is the surface defined by the paraboloid $z=1-x^2-y^2$, where $z \\\\geq 0$.', 'options': {'A': '$2\\\\pi$', 'B': '$\\\\pi$', 'C': '$\\\\frac{\\\\pi}{2}$', 'D': '$4\\\\pi$', 'E': '$-\\\\frac{\\\\pi}{2}$', 'F': '$-2\\\\pi$', 'G': '$3\\\\pi$', 'H': '$0$', 'I': '$-\\\\pi$', 'J': '$-3\\\\pi$'}, 'answer': ''}) (input_keys={'options', 'question'}): 'list' object has no attribute 'items'\n",
      "Stack trace:\n",
      "Traceback (most recent call last):\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/base.py\", line 30, in __call__\n",
      "    value = self.parse(signature, output, _parse_values=_parse_values)\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/callback.py\", line 234, in wrapper\n",
      "    return fn(instance, *args, **kwargs)\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/chat_adapter.py\", line 85, in parse\n",
      "    raise ValueError(f\"Expected {signature.output_fields.keys()} but got {fields.keys()}\")\n",
      "ValueError: Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning'])\n",
      "\n",
      "During handling of the above exception, another exception occurred:\n",
      "\n",
      "Traceback (most recent call last):\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/parallelizer.py\", line 47, in wrapped\n",
      "    return function(item)\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/evaluate/evaluate.py\", line 101, in process_item\n",
      "    prediction = program(**example.inputs())\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/callback.py\", line 234, in wrapper\n",
      "    return fn(instance, *args, **kwargs)\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/primitives/program.py\", line 24, in __call__\n",
      "    return self.forward(*args, **kwargs)\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/predict/chain_of_thought.py\", line 44, in forward\n",
      "    return self._predict(signature=signature, **kwargs)\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/callback.py\", line 234, in wrapper\n",
      "    return fn(instance, *args, **kwargs)\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/predict/predict.py\", line 93, in __call__\n",
      "    return self.forward(**kwargs)\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/predict/predict.py\", line 127, in forward\n",
      "    completions = v2_5_generate(lm, config, signature, demos, kwargs, _parse_values=self._parse_values)\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/predict/predict.py\", line 234, in v2_5_generate\n",
      "    return adapter(\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/base.py\", line 45, in __call__\n",
      "    return JSONAdapter()(lm, lm_kwargs, signature, demos, inputs, _parse_values=_parse_values)\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/json_adapter.py\", line 61, in __call__\n",
      "    value = self.parse(signature, output, _parse_values=_parse_values)\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/utils/callback.py\", line 234, in wrapper\n",
      "    return fn(instance, *args, **kwargs)\n",
      "  File \"/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/dspy/adapters/json_adapter.py\", line 95, in parse\n",
      "    fields = {k: v for k, v in fields.items() if k in signature.output_fields}\n",
      "AttributeError: 'list' object has no attribute 'items'\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 23.00 / 39 (59.0%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 2965.01it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 00:13:12 INFO dspy.evaluate.evaluate: Average Metric: 23.0 / 40 (57.5%)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>options</th>\n",
       "      <th>example_answer</th>\n",
       "      <th>reasoning</th>\n",
       "      <th>pred_answer</th>\n",
       "      <th>metric</th>\n",
       "      <th>answer</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Two xylem plant cell types that provide support and conduct water ...</td>\n",
       "      <td>{'A': 'parenchyma and companion cells', 'B': 'tracheids and vessel...</td>\n",
       "      <td>B</td>\n",
       "      <td>To answer this question, we need to identify the two types of xyle...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Calculate the drag force acting on a 0.75 ft × 7 ft smooth platewh...</td>\n",
       "      <td>{'A': '6.9 lb_f', 'B': '14.2 lb_f', 'C': '11.3 lb_f', 'D': '12.4 l...</td>\n",
       "      <td>D</td>\n",
       "      <td>To calculate the drag force acting on the plate, we can use the fo...</td>\n",
       "      <td>A</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>For which of the following is it appropriate to use a census?</td>\n",
       "      <td>{'A': 'A two-tailed hypothesis test where the null hypothesis was ...</td>\n",
       "      <td>F</td>\n",
       "      <td>A census is appropriate when the goal is to collect data from ever...</td>\n",
       "      <td>J</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A housing corporation owned a tract of land and prepared a develop...</td>\n",
       "      <td>{'A': 'The community association has no authority to enforce the D...</td>\n",
       "      <td>E</td>\n",
       "      <td>The best argument for the landscaper is that the annual assessment...</td>\n",
       "      <td>F</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A population of students taking a basic genetics class had their b...</td>\n",
       "      <td>{'A': 'p=0.415, q=0.2075, r=0.3133', 'B': 'p=0.2075, q=0.3133, r=0...</td>\n",
       "      <td>F</td>\n",
       "      <td>To find the frequencies of the alleles I^A, I^B, and i, we can use...</td>\n",
       "      <td>D</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Hydrogen chloride (HCl) is used industrially in the manufacture of...</td>\n",
       "      <td>{'A': '20.5 %', 'B': '19.3 %', 'C': '23.7 %', 'D': '16.8 %', 'E': ...</td>\n",
       "      <td>D</td>\n",
       "      <td>To estimate the percent ionic character of the HCl bond, we can us...</td>\n",
       "      <td>B</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>What is the smallest whole number that has a remainder of 1 when d...</td>\n",
       "      <td>{'A': '72', 'B': '76', 'C': '67', 'D': '64', 'E': '57', 'F': '45',...</td>\n",
       "      <td>E</td>\n",
       "      <td>To find the smallest whole number that satisfies all the given con...</td>\n",
       "      <td>J</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>We roll a fair 6-sided die 5 times. What is the probability that w...</td>\n",
       "      <td>{'A': '\\frac{25}{648}', 'B': '\\frac{1000}{648}', 'C': '\\frac{625}{...</td>\n",
       "      <td></td>\n",
       "      <td>To find the probability of getting a 6 in at most 2 of the rolls, ...</td>\n",
       "      <td>C</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>A bird is lost in a 3 by 3 by 3 cubical maze. The bird flies from ...</td>\n",
       "      <td>{'A': '1.95', 'B': '2.03', 'C': '1.85', 'D': '2.25', 'E': '2.10', ...</td>\n",
       "      <td>A</td>\n",
       "      <td>To find the entropy rate of the random walk, we first need to unde...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>How do iconic concepts answer the question of how universals or ge...</td>\n",
       "      <td>{'A': \"Iconic concepts represent 'universals' by dynamically creat...</td>\n",
       "      <td>F</td>\n",
       "      <td>To answer this question, we need to consider how iconic concepts, ...</td>\n",
       "      <td>F</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>A tube is heated by means of forced convection from an airstream. ...</td>\n",
       "      <td>{'A': '(a) 1.0 psi, (b) 85000 Btu/hr, (c) 11.2 Btu/hr-ft^2-°F', 'B...</td>\n",
       "      <td>C</td>\n",
       "      <td>To solve this problem, we need to apply the principles of fluid me...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Under the Sales Article of the UCC, which of the following circums...</td>\n",
       "      <td>{'A': 'The seller knows the particular purpose for which the buyer...</td>\n",
       "      <td>F</td>\n",
       "      <td>The implied warranty of fitness for a particular purpose arises wh...</td>\n",
       "      <td>F</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Type I muscle fibres have the following characteristics:</td>\n",
       "      <td>{'A': 'white, glycolytic, fast contracting.', 'B': 'red, oxidative...</td>\n",
       "      <td>H</td>\n",
       "      <td>To answer this question, we need to understand the characteristics...</td>\n",
       "      <td>H</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Francois Quesnay and thePhysiocratsargued twenty years before Adam...</td>\n",
       "      <td>{'A': 'Wealth accumulation is solely based on financial investment...</td>\n",
       "      <td>H</td>\n",
       "      <td>The Physiocrats, led by Francois Quesnay, believed that agricultur...</td>\n",
       "      <td>H</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Find the value of the integral $\\int_S(\\nabla \\times \\mathbf{A}) \\...</td>\n",
       "      <td>{'A': '$2\\pi$', 'B': '$\\pi$', 'C': '$\\frac{\\pi}{2}$', 'D': '$4\\pi$...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>List and compare the tissues that support and hold together theoth...</td>\n",
       "      <td>{'A': \"Endocrine tissue provides the main structural support for t...</td>\n",
       "      <td>G</td>\n",
       "      <td>To answer this question, we need to identify the type of tissue th...</td>\n",
       "      <td>G</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Enhancement of job satisfaction and productivity are key character...</td>\n",
       "      <td>{'A': 'Human Relations theory', 'B': 'Process improvement', 'C': '...</td>\n",
       "      <td>J</td>\n",
       "      <td>The question asks for the theoretical perspective of work design t...</td>\n",
       "      <td>C</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Calculate the time needed for a water spill to evaporate into stil...</td>\n",
       "      <td>{'A': '6.2 hr', 'B': '2.0 hr', 'C': '4.0 hr', 'D': '3.25 hr', 'E':...</td>\n",
       "      <td>G</td>\n",
       "      <td>To solve this problem, we can use the concept of molecular diffusi...</td>\n",
       "      <td>A</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>This question refers to the following information. \"MAHATMA GANDHI...</td>\n",
       "      <td>{'A': 'The spread of the Black Death in Europe', 'B': 'The discove...</td>\n",
       "      <td>J</td>\n",
       "      <td>The instructions to boycott foreign cloth, as indicated in the fif...</td>\n",
       "      <td>J</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>A homeowner awoke one night and observed a thief stealing chickens...</td>\n",
       "      <td>{'A': 'Yes, because the thief was committing a crime, and the home...</td>\n",
       "      <td>J</td>\n",
       "      <td>The use of deadly force is generally justified only when there is ...</td>\n",
       "      <td>J</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>A centrifugal fan with an efficiency of 60% is used to pump fluega...</td>\n",
       "      <td>{'A': '17.92 hp', 'B': '13.81 hp', 'C': '25.76 hp', 'D': '8.67 hp'...</td>\n",
       "      <td>A</td>\n",
       "      <td>To determine the power needed to drive the fan, we first need to c...</td>\n",
       "      <td>H</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>If polygon ABCDE ~ polygon PQRST, AB = BC = 8, AE = CD = 4, ED = 6...</td>\n",
       "      <td>{'A': '18', 'B': '27.5', 'C': '33', 'D': '21', 'E': '25', 'F': '22...</td>\n",
       "      <td></td>\n",
       "      <td>Given that polygon ABCDE ~ polygon PQRST, we know that correspondi...</td>\n",
       "      <td>F</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>A company is interested in comparing the mean sales revenue per sa...</td>\n",
       "      <td>{'A': 'The population of the sales records at each location is not...</td>\n",
       "      <td>B</td>\n",
       "      <td>To determine the necessary assumption for the validity of the t-te...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>How many calories of heat are required to raise 1,000 grams of wat...</td>\n",
       "      <td>{'A': '45,000 calories', 'B': '9,000 calories', 'C': '100,000 calo...</td>\n",
       "      <td>H</td>\n",
       "      <td>To solve this problem, we use the formula for heat transfer due to...</td>\n",
       "      <td>H</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>Which of the following geometries corresponds to a substance that ...</td>\n",
       "      <td>{'A': 'Bent', 'B': 'Trigonal pyramid', 'C': 'T-shaped', 'D': 'Trig...</td>\n",
       "      <td>G</td>\n",
       "      <td>To solve this, we need to apply the VSEPR (Valence Shell Electron ...</td>\n",
       "      <td>B</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>Find Antilog_10 0.8762 - 2.</td>\n",
       "      <td>{'A': '8.762', 'B': '0.8762', 'C': '0.0752', 'D': '0.8752', 'E': '...</td>\n",
       "      <td>C</td>\n",
       "      <td>To find Antilog_10 (0.8762 - 2), we first calculate the value insi...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>How many labeled graphs with a score of (6, 2, 2, 2, 2, 2, 2) are ...</td>\n",
       "      <td>{'A': '35', 'B': '20', 'C': '25', 'D': '18', 'E': '30', 'F': '24',...</td>\n",
       "      <td>I</td>\n",
       "      <td>To find the number of labeled graphs with a score of (6, 2, 2, 2, ...</td>\n",
       "      <td>A</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>As of 2019, about what percentage of Turks say God plays an import...</td>\n",
       "      <td>{'A': '59%', 'B': '99%', 'C': '39%', 'D': '19%', 'E': '29%', 'F': ...</td>\n",
       "      <td>F</td>\n",
       "      <td>To answer this question, we need to consider the cultural and reli...</td>\n",
       "      <td>F</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>A sample of liquid NH3 is brought to its boiling point. Which of t...</td>\n",
       "      <td>{'A': 'The hydrogen bonds within individual NH3 molecules break ap...</td>\n",
       "      <td>H</td>\n",
       "      <td>To solve this question, we need to understand the process of boili...</td>\n",
       "      <td>H</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>Assume all gases are perfect unless stated otherwise. Note that 1 ...</td>\n",
       "      <td>{'A': '+55$\\text{K}$', 'B': '+65$\\text{K}$', 'C': '+42$\\text{K}$',...</td>\n",
       "      <td>E</td>\n",
       "      <td>To find the temperature rise, we can use the formula for heat tran...</td>\n",
       "      <td>E</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>Which writer was concerned with the reaction of workers to key cha...</td>\n",
       "      <td>{'A': 'Marx', 'B': 'Lewin', 'C': 'Mayo', 'D': 'Maslow', 'E': 'Freu...</td>\n",
       "      <td>I</td>\n",
       "      <td>The question asks about a writer concerned with the reaction of wo...</td>\n",
       "      <td>C</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>A male neonate, who was born at 36 weeks' gestation 2 hours ago in...</td>\n",
       "      <td>{'A': 'Pulmonary embolism', 'B': 'Pneumothorax', 'C': 'Pneumonia',...</td>\n",
       "      <td>J</td>\n",
       "      <td>The patient is a male neonate born at 36 weeks' gestation with sig...</td>\n",
       "      <td>J</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>Archaeological evidence for the domestication of cats suggests it ...</td>\n",
       "      <td>{'A': 'after 11,000 years ago, as an unintentional consequence of ...</td>\n",
       "      <td>A</td>\n",
       "      <td>The domestication of cats is believed to have occurred as an unint...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>Suppose the FED buys $100,000 of government bonds from a commercia...</td>\n",
       "      <td>{'A': '$81,000', 'B': '$720,000', 'C': '$1,100,000', 'D': '$250,00...</td>\n",
       "      <td>G</td>\n",
       "      <td>When the FED buys $100,000 of government bonds from a commercial b...</td>\n",
       "      <td>G</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>An air-coretoroidhas 500 turns, a cross-sectional area of 6 cm^2, ...</td>\n",
       "      <td>{'A': '1.9 × 10^-6 A-T/m', 'B': '1,250 A-T/m', 'C': '1,500 A-T/m',...</td>\n",
       "      <td>B</td>\n",
       "      <td>To find the magnetic field of the toroidal coil, we can use the fo...</td>\n",
       "      <td>I</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>A 22-year-old male presents to the office with a 5-day history of ...</td>\n",
       "      <td>{'A': 'Clostridium difficile', 'B': 'Entamoeba histolytica', 'C': ...</td>\n",
       "      <td>A</td>\n",
       "      <td>The patient's presentation of diarrhea following a course of antib...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>Evaluate the series\\n$$\\nS=\\sum_{n=0}^{\\infty} \\frac{1}{3^n}\\n$$</td>\n",
       "      <td>{'A': '2 / 3', 'B': '1 / 2', 'C': '1 / 3', 'D': '3', 'E': '7 / 4',...</td>\n",
       "      <td>J</td>\n",
       "      <td>The given series is a geometric series with the first term \\(a = 1...</td>\n",
       "      <td>J</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>Protons of mass 1.67 × 10^-27 kg and moving with a velocity of 2 ×...</td>\n",
       "      <td>{'A': '0.8°C per second', 'B': '1.5°C', 'C': '3.2°C', 'D': '3.6°C ...</td>\n",
       "      <td>J</td>\n",
       "      <td>To find the rate at which the temperature of the target initially ...</td>\n",
       "      <td>H</td>\n",
       "      <td></td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>A riot broke out in a city, and many shops had been looted and som...</td>\n",
       "      <td>{'A': 'affirm both the conviction and the reimbursement order.', '...</td>\n",
       "      <td>E</td>\n",
       "      <td>The defendant's right to self-representation is protected by the S...</td>\n",
       "      <td>E</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>Which of the following statements would \"supply-side\" economists d...</td>\n",
       "      <td>{'A': 'Tax changes cause shifts in aggregate supply that work agai...</td>\n",
       "      <td>E</td>\n",
       "      <td>To answer this question, we need to understand the principles of s...</td>\n",
       "      <td>E</td>\n",
       "      <td>✔️ [True]</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                 question  \\\n",
       "0   Two xylem plant cell types that provide support and conduct water ...   \n",
       "1   Calculate the drag force acting on a 0.75 ft × 7 ft smooth platewh...   \n",
       "2           For which of the following is it appropriate to use a census?   \n",
       "3   A housing corporation owned a tract of land and prepared a develop...   \n",
       "4   A population of students taking a basic genetics class had their b...   \n",
       "5   Hydrogen chloride (HCl) is used industrially in the manufacture of...   \n",
       "6   What is the smallest whole number that has a remainder of 1 when d...   \n",
       "7   We roll a fair 6-sided die 5 times. What is the probability that w...   \n",
       "8   A bird is lost in a 3 by 3 by 3 cubical maze. The bird flies from ...   \n",
       "9   How do iconic concepts answer the question of how universals or ge...   \n",
       "10  A tube is heated by means of forced convection from an airstream. ...   \n",
       "11  Under the Sales Article of the UCC, which of the following circums...   \n",
       "12               Type I muscle fibres have the following characteristics:   \n",
       "13  Francois Quesnay and thePhysiocratsargued twenty years before Adam...   \n",
       "14  Find the value of the integral $\\int_S(\\nabla \\times \\mathbf{A}) \\...   \n",
       "15  List and compare the tissues that support and hold together theoth...   \n",
       "16  Enhancement of job satisfaction and productivity are key character...   \n",
       "17  Calculate the time needed for a water spill to evaporate into stil...   \n",
       "18  This question refers to the following information. \"MAHATMA GANDHI...   \n",
       "19  A homeowner awoke one night and observed a thief stealing chickens...   \n",
       "20  A centrifugal fan with an efficiency of 60% is used to pump fluega...   \n",
       "21  If polygon ABCDE ~ polygon PQRST, AB = BC = 8, AE = CD = 4, ED = 6...   \n",
       "22  A company is interested in comparing the mean sales revenue per sa...   \n",
       "23  How many calories of heat are required to raise 1,000 grams of wat...   \n",
       "24  Which of the following geometries corresponds to a substance that ...   \n",
       "25                                            Find Antilog_10 0.8762 - 2.   \n",
       "26  How many labeled graphs with a score of (6, 2, 2, 2, 2, 2, 2) are ...   \n",
       "27  As of 2019, about what percentage of Turks say God plays an import...   \n",
       "28  A sample of liquid NH3 is brought to its boiling point. Which of t...   \n",
       "29  Assume all gases are perfect unless stated otherwise. Note that 1 ...   \n",
       "30  Which writer was concerned with the reaction of workers to key cha...   \n",
       "31  A male neonate, who was born at 36 weeks' gestation 2 hours ago in...   \n",
       "32  Archaeological evidence for the domestication of cats suggests it ...   \n",
       "33  Suppose the FED buys $100,000 of government bonds from a commercia...   \n",
       "34  An air-coretoroidhas 500 turns, a cross-sectional area of 6 cm^2, ...   \n",
       "35  A 22-year-old male presents to the office with a 5-day history of ...   \n",
       "36       Evaluate the series\\n$$\\nS=\\sum_{n=0}^{\\infty} \\frac{1}{3^n}\\n$$   \n",
       "37  Protons of mass 1.67 × 10^-27 kg and moving with a velocity of 2 ×...   \n",
       "38  A riot broke out in a city, and many shops had been looted and som...   \n",
       "39  Which of the following statements would \"supply-side\" economists d...   \n",
       "\n",
       "                                                                  options  \\\n",
       "0   {'A': 'parenchyma and companion cells', 'B': 'tracheids and vessel...   \n",
       "1   {'A': '6.9 lb_f', 'B': '14.2 lb_f', 'C': '11.3 lb_f', 'D': '12.4 l...   \n",
       "2   {'A': 'A two-tailed hypothesis test where the null hypothesis was ...   \n",
       "3   {'A': 'The community association has no authority to enforce the D...   \n",
       "4   {'A': 'p=0.415, q=0.2075, r=0.3133', 'B': 'p=0.2075, q=0.3133, r=0...   \n",
       "5   {'A': '20.5 %', 'B': '19.3 %', 'C': '23.7 %', 'D': '16.8 %', 'E': ...   \n",
       "6   {'A': '72', 'B': '76', 'C': '67', 'D': '64', 'E': '57', 'F': '45',...   \n",
       "7   {'A': '\\frac{25}{648}', 'B': '\\frac{1000}{648}', 'C': '\\frac{625}{...   \n",
       "8   {'A': '1.95', 'B': '2.03', 'C': '1.85', 'D': '2.25', 'E': '2.10', ...   \n",
       "9   {'A': \"Iconic concepts represent 'universals' by dynamically creat...   \n",
       "10  {'A': '(a) 1.0 psi, (b) 85000 Btu/hr, (c) 11.2 Btu/hr-ft^2-°F', 'B...   \n",
       "11  {'A': 'The seller knows the particular purpose for which the buyer...   \n",
       "12  {'A': 'white, glycolytic, fast contracting.', 'B': 'red, oxidative...   \n",
       "13  {'A': 'Wealth accumulation is solely based on financial investment...   \n",
       "14  {'A': '$2\\pi$', 'B': '$\\pi$', 'C': '$\\frac{\\pi}{2}$', 'D': '$4\\pi$...   \n",
       "15  {'A': \"Endocrine tissue provides the main structural support for t...   \n",
       "16  {'A': 'Human Relations theory', 'B': 'Process improvement', 'C': '...   \n",
       "17  {'A': '6.2 hr', 'B': '2.0 hr', 'C': '4.0 hr', 'D': '3.25 hr', 'E':...   \n",
       "18  {'A': 'The spread of the Black Death in Europe', 'B': 'The discove...   \n",
       "19  {'A': 'Yes, because the thief was committing a crime, and the home...   \n",
       "20  {'A': '17.92 hp', 'B': '13.81 hp', 'C': '25.76 hp', 'D': '8.67 hp'...   \n",
       "21  {'A': '18', 'B': '27.5', 'C': '33', 'D': '21', 'E': '25', 'F': '22...   \n",
       "22  {'A': 'The population of the sales records at each location is not...   \n",
       "23  {'A': '45,000 calories', 'B': '9,000 calories', 'C': '100,000 calo...   \n",
       "24  {'A': 'Bent', 'B': 'Trigonal pyramid', 'C': 'T-shaped', 'D': 'Trig...   \n",
       "25  {'A': '8.762', 'B': '0.8762', 'C': '0.0752', 'D': '0.8752', 'E': '...   \n",
       "26  {'A': '35', 'B': '20', 'C': '25', 'D': '18', 'E': '30', 'F': '24',...   \n",
       "27  {'A': '59%', 'B': '99%', 'C': '39%', 'D': '19%', 'E': '29%', 'F': ...   \n",
       "28  {'A': 'The hydrogen bonds within individual NH3 molecules break ap...   \n",
       "29  {'A': '+55$\\text{K}$', 'B': '+65$\\text{K}$', 'C': '+42$\\text{K}$',...   \n",
       "30  {'A': 'Marx', 'B': 'Lewin', 'C': 'Mayo', 'D': 'Maslow', 'E': 'Freu...   \n",
       "31  {'A': 'Pulmonary embolism', 'B': 'Pneumothorax', 'C': 'Pneumonia',...   \n",
       "32  {'A': 'after 11,000 years ago, as an unintentional consequence of ...   \n",
       "33  {'A': '$81,000', 'B': '$720,000', 'C': '$1,100,000', 'D': '$250,00...   \n",
       "34  {'A': '1.9 × 10^-6 A-T/m', 'B': '1,250 A-T/m', 'C': '1,500 A-T/m',...   \n",
       "35  {'A': 'Clostridium difficile', 'B': 'Entamoeba histolytica', 'C': ...   \n",
       "36  {'A': '2 / 3', 'B': '1 / 2', 'C': '1 / 3', 'D': '3', 'E': '7 / 4',...   \n",
       "37  {'A': '0.8°C per second', 'B': '1.5°C', 'C': '3.2°C', 'D': '3.6°C ...   \n",
       "38  {'A': 'affirm both the conviction and the reimbursement order.', '...   \n",
       "39  {'A': 'Tax changes cause shifts in aggregate supply that work agai...   \n",
       "\n",
       "   example_answer  \\\n",
       "0               B   \n",
       "1               D   \n",
       "2               F   \n",
       "3               E   \n",
       "4               F   \n",
       "5               D   \n",
       "6               E   \n",
       "7                   \n",
       "8               A   \n",
       "9               F   \n",
       "10              C   \n",
       "11              F   \n",
       "12              H   \n",
       "13              H   \n",
       "14            NaN   \n",
       "15              G   \n",
       "16              J   \n",
       "17              G   \n",
       "18              J   \n",
       "19              J   \n",
       "20              A   \n",
       "21                  \n",
       "22              B   \n",
       "23              H   \n",
       "24              G   \n",
       "25              C   \n",
       "26              I   \n",
       "27              F   \n",
       "28              H   \n",
       "29              E   \n",
       "30              I   \n",
       "31              J   \n",
       "32              A   \n",
       "33              G   \n",
       "34              B   \n",
       "35              A   \n",
       "36              J   \n",
       "37              J   \n",
       "38              E   \n",
       "39              E   \n",
       "\n",
       "                                                                reasoning  \\\n",
       "0   To answer this question, we need to identify the two types of xyle...   \n",
       "1   To calculate the drag force acting on the plate, we can use the fo...   \n",
       "2   A census is appropriate when the goal is to collect data from ever...   \n",
       "3   The best argument for the landscaper is that the annual assessment...   \n",
       "4   To find the frequencies of the alleles I^A, I^B, and i, we can use...   \n",
       "5   To estimate the percent ionic character of the HCl bond, we can us...   \n",
       "6   To find the smallest whole number that satisfies all the given con...   \n",
       "7   To find the probability of getting a 6 in at most 2 of the rolls, ...   \n",
       "8   To find the entropy rate of the random walk, we first need to unde...   \n",
       "9   To answer this question, we need to consider how iconic concepts, ...   \n",
       "10  To solve this problem, we need to apply the principles of fluid me...   \n",
       "11  The implied warranty of fitness for a particular purpose arises wh...   \n",
       "12  To answer this question, we need to understand the characteristics...   \n",
       "13  The Physiocrats, led by Francois Quesnay, believed that agricultur...   \n",
       "14                                                                    NaN   \n",
       "15  To answer this question, we need to identify the type of tissue th...   \n",
       "16  The question asks for the theoretical perspective of work design t...   \n",
       "17  To solve this problem, we can use the concept of molecular diffusi...   \n",
       "18  The instructions to boycott foreign cloth, as indicated in the fif...   \n",
       "19  The use of deadly force is generally justified only when there is ...   \n",
       "20  To determine the power needed to drive the fan, we first need to c...   \n",
       "21  Given that polygon ABCDE ~ polygon PQRST, we know that correspondi...   \n",
       "22  To determine the necessary assumption for the validity of the t-te...   \n",
       "23  To solve this problem, we use the formula for heat transfer due to...   \n",
       "24  To solve this, we need to apply the VSEPR (Valence Shell Electron ...   \n",
       "25  To find Antilog_10 (0.8762 - 2), we first calculate the value insi...   \n",
       "26  To find the number of labeled graphs with a score of (6, 2, 2, 2, ...   \n",
       "27  To answer this question, we need to consider the cultural and reli...   \n",
       "28  To solve this question, we need to understand the process of boili...   \n",
       "29  To find the temperature rise, we can use the formula for heat tran...   \n",
       "30  The question asks about a writer concerned with the reaction of wo...   \n",
       "31  The patient is a male neonate born at 36 weeks' gestation with sig...   \n",
       "32  The domestication of cats is believed to have occurred as an unint...   \n",
       "33  When the FED buys $100,000 of government bonds from a commercial b...   \n",
       "34  To find the magnetic field of the toroidal coil, we can use the fo...   \n",
       "35  The patient's presentation of diarrhea following a course of antib...   \n",
       "36  The given series is a geometric series with the first term \\(a = 1...   \n",
       "37  To find the rate at which the temperature of the target initially ...   \n",
       "38  The defendant's right to self-representation is protected by the S...   \n",
       "39  To answer this question, we need to understand the principles of s...   \n",
       "\n",
       "   pred_answer     metric answer  \n",
       "0            B  ✔️ [True]    NaN  \n",
       "1            A               NaN  \n",
       "2            J               NaN  \n",
       "3            F               NaN  \n",
       "4            D               NaN  \n",
       "5            B               NaN  \n",
       "6            J               NaN  \n",
       "7            C               NaN  \n",
       "8            A  ✔️ [True]    NaN  \n",
       "9            F  ✔️ [True]    NaN  \n",
       "10           C  ✔️ [True]    NaN  \n",
       "11           F  ✔️ [True]    NaN  \n",
       "12           H  ✔️ [True]    NaN  \n",
       "13           H  ✔️ [True]    NaN  \n",
       "14         NaN                    \n",
       "15           G  ✔️ [True]    NaN  \n",
       "16           C               NaN  \n",
       "17           A               NaN  \n",
       "18           J  ✔️ [True]    NaN  \n",
       "19           J  ✔️ [True]    NaN  \n",
       "20           H               NaN  \n",
       "21           F               NaN  \n",
       "22           B  ✔️ [True]    NaN  \n",
       "23           H  ✔️ [True]    NaN  \n",
       "24           B               NaN  \n",
       "25           C  ✔️ [True]    NaN  \n",
       "26           A               NaN  \n",
       "27           F  ✔️ [True]    NaN  \n",
       "28           H  ✔️ [True]    NaN  \n",
       "29           E  ✔️ [True]    NaN  \n",
       "30           C               NaN  \n",
       "31           J  ✔️ [True]    NaN  \n",
       "32           A  ✔️ [True]    NaN  \n",
       "33           G  ✔️ [True]    NaN  \n",
       "34           I               NaN  \n",
       "35           A  ✔️ [True]    NaN  \n",
       "36           J  ✔️ [True]    NaN  \n",
       "37           H               NaN  \n",
       "38           E  ✔️ [True]    NaN  \n",
       "39           E  ✔️ [True]    NaN  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 598 ms, sys: 8.98 ms, total: 607 ms\n",
      "Wall time: 598 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Quick evaluation on the first 40 test examples (pass `devset=testset` for the full set).\n",
    "# `results` is unpacked by later cells as (example, prediction, score) triples.\n",
    "print(\"Starting execution...\")\n",
    "score, results, all_scores = evaluate(program, devset=testset[:40])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "# Dump every entry of `results` as one CSV row. Later cells unpack each entry as an\n",
    "# (example, prediction, score) triple; csv stringifies non-string fields with str().\n",
    "with open(\"my_results.csv\", mode=\"w\", newline=\"\", encoding=\"utf-8\") as csv_file:\n",
    "    csv.writer(csv_file).writerows(results)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Results DataFrame:\n",
      "    Example_Index  \\\n",
      "0               0   \n",
      "1               1   \n",
      "2               2   \n",
      "3               3   \n",
      "4               4   \n",
      "5               5   \n",
      "6               6   \n",
      "7               7   \n",
      "8               8   \n",
      "9               9   \n",
      "10             10   \n",
      "11             11   \n",
      "12             12   \n",
      "13             13   \n",
      "14             14   \n",
      "15             15   \n",
      "16             16   \n",
      "17             17   \n",
      "18             18   \n",
      "19             19   \n",
      "20             20   \n",
      "21             21   \n",
      "22             22   \n",
      "23             23   \n",
      "24             24   \n",
      "25             25   \n",
      "26             26   \n",
      "27             27   \n",
      "28             28   \n",
      "29             29   \n",
      "30             30   \n",
      "31             31   \n",
      "32             32   \n",
      "33             33   \n",
      "34             34   \n",
      "35             35   \n",
      "36             36   \n",
      "37             37   \n",
      "38             38   \n",
      "39             39   \n",
      "40             40   \n",
      "41             41   \n",
      "42             42   \n",
      "43             43   \n",
      "44             44   \n",
      "45             45   \n",
      "46             46   \n",
      "47             47   \n",
      "48             48   \n",
      "49             49   \n",
      "50             50   \n",
      "51             51   \n",
      "52             52   \n",
      "53             53   \n",
      "54             54   \n",
      "55             55   \n",
      "56             56   \n",
      "57             57   \n",
      "58             58   \n",
      "\n",
      "                                                               Prediction  \\\n",
      "0   Prediction(\\n    reasoning=\"To solve this problem, we need to appl...   \n",
      "1   Prediction(\\n    reasoning='The capacitance of a parallel-plate ca...   \n",
      "2   Prediction(\\n    reasoning=\"To calculate the contribution to $U_{\\...   \n",
      "3   Prediction(\\n    reasoning='To translate the given statement \"Abdu...   \n",
      "4   Prediction(\\n    reasoning='The passage states that perestroika, o...   \n",
      "5   Prediction(\\n    reasoning='The refractory period is a phase that ...   \n",
      "6   Prediction(\\n    reasoning=\"To solve this problem, we need to cons...   \n",
      "7   Prediction(\\n    reasoning=\"To find the weight of one truck, we fi...   \n",
      "8   Prediction(\\n    reasoning='To estimate the magnitude of the elect...   \n",
      "9   Prediction(\\n    reasoning=\"The patient presents with nausea, vomi...   \n",
      "10  Prediction(\\n    reasoning=\"The builder properly rejected the ship...   \n",
      "11  Prediction(\\n    reasoning=\"The man conveyed a right-of-way easeme...   \n",
      "12  Prediction(\\n    reasoning='To answer this question, we need to co...   \n",
      "13  Prediction(\\n    reasoning='To answer this question, we need to un...   \n",
      "14  Prediction(\\n    reasoning='To determine the final angular momentu...   \n",
      "15  Prediction(\\n    reasoning='To calculate the molecular weight of t...   \n",
      "16  Prediction(\\n    reasoning=\"The appropriateness and usefulness of ...   \n",
      "17  Prediction(\\n    reasoning='To find the angular magnification of t...   \n",
      "18  Prediction(\\n    reasoning='Multiple-choice questions are often ea...   \n",
      "19  Prediction(\\n    reasoning='The question describes a treatment pro...   \n",
      "20  Prediction(\\n    reasoning='To solve this problem, we need to use ...   \n",
      "21  Prediction(\\n    reasoning=\"Since the DNA molecule is 20 percent a...   \n",
      "22  Prediction(\\n    reasoning='The process described has NADP+, ADP, ...   \n",
      "23  Prediction(\\n    reasoning='The question describes a scenario wher...   \n",
      "24  Prediction(\\n    reasoning='The lac operon is a genetic regulatory...   \n",
      "25  Prediction(\\n    reasoning='To answer this question, we need to id...   \n",
      "26  Prediction(\\n    reasoning='To find the total tax Mr. Howard pays,...   \n",
      "27  Prediction(\\n    reasoning='The amplitude of a damped harmonic osc...   \n",
      "28  Prediction(\\n    reasoning='The statement \"Men are better drivers ...   \n",
      "29  Prediction(\\n    reasoning='To find the desired level of money bal...   \n",
      "30  Prediction(\\n    reasoning='The clinical presentation described fo...   \n",
      "31  Prediction(\\n    reasoning=\"To solve this problem, we first need t...   \n",
      "32  Prediction(\\n    reasoning='To determine where Mr. Balfour should ...   \n",
      "33  Prediction(\\n    reasoning=\"To solve this problem, we need to unde...   \n",
      "34  Prediction(\\n    reasoning='The patient presents with symptoms of ...   \n",
      "35  Prediction(\\n    reasoning='The \"sex-change\" in bacteria is relate...   \n",
      "36  Prediction(\\n    reasoning='To answer this question, we need to co...   \n",
      "37  Prediction(\\n    reasoning=\"To determine if the burger joint owner...   \n",
      "38  Prediction(\\n    reasoning=\"To find the pH of a 0.1 M solution of ...   \n",
      "39  Prediction(\\n    reasoning='The question describes a scenario wher...   \n",
      "40  Prediction(\\n    reasoning='The common law principles regarding th...   \n",
      "41  Prediction(\\n    reasoning='To find the concentration of the sodiu...   \n",
      "42  Prediction(\\n    reasoning='LC oscillators, also known as tank cir...   \n",
      "43  Prediction(\\n    reasoning='To determine which of the given proces...   \n",
      "44  Prediction(\\n    reasoning=\"The marginal utility approach explains...   \n",
      "45  Prediction(\\n    reasoning=\"To solve this problem, we need to calc...   \n",
      "46  Prediction(\\n    reasoning=\"The correct answer is based on the pri...   \n",
      "47  Prediction(\\n    reasoning=\"To find the posterior probability \\\\(P...   \n",
      "48  Prediction(\\n    reasoning=\"To determine the day of the week on wh...   \n",
      "49  Prediction(\\n    reasoning=\"To solve this problem, we need to unde...   \n",
      "50  Prediction(\\n    reasoning='To calculate the total change in entro...   \n",
      "51  Prediction(\\n    reasoning='To determine when the tenant\\'s statut...   \n",
      "52  Prediction(\\n    reasoning=\"To solve this problem, we first need t...   \n",
      "53  Prediction(\\n    reasoning=\"To determine which type of organisatio...   \n",
      "54  Prediction(\\n    reasoning='To answer this question, we need to un...   \n",
      "55  Prediction(\\n    reasoning=\"To estimate a numerical measurement of...   \n",
      "56  Prediction(\\n    reasoning='Anscombe\\'s work focuses on the philos...   \n",
      "57  Prediction(\\n    reasoning=\"The practice of requiring students to ...   \n",
      "58  Prediction(\\n    reasoning=\"The production period in the history o...   \n",
      "\n",
      "    Score  \n",
      "0    True  \n",
      "1    True  \n",
      "2   False  \n",
      "3    True  \n",
      "4    True  \n",
      "5    True  \n",
      "6    True  \n",
      "7    True  \n",
      "8   False  \n",
      "9    True  \n",
      "10   True  \n",
      "11   True  \n",
      "12   True  \n",
      "13   True  \n",
      "14   True  \n",
      "15  False  \n",
      "16   True  \n",
      "17   True  \n",
      "18   True  \n",
      "19   True  \n",
      "20  False  \n",
      "21  False  \n",
      "22   True  \n",
      "23   True  \n",
      "24   True  \n",
      "25   True  \n",
      "26   True  \n",
      "27  False  \n",
      "28   True  \n",
      "29   True  \n",
      "30   True  \n",
      "31  False  \n",
      "32   True  \n",
      "33  False  \n",
      "34   True  \n",
      "35   True  \n",
      "36  False  \n",
      "37   True  \n",
      "38  False  \n",
      "39   True  \n",
      "40   True  \n",
      "41   True  \n",
      "42  False  \n",
      "43   True  \n",
      "44   True  \n",
      "45   True  \n",
      "46   True  \n",
      "47   True  \n",
      "48  False  \n",
      "49   True  \n",
      "50   True  \n",
      "51   True  \n",
      "52  False  \n",
      "53   True  \n",
      "54   True  \n",
      "55   True  \n",
      "56   True  \n",
      "57  False  \n",
      "58  False  \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Flatten the (example, prediction, score) triples into a table keyed by position;\n",
    "# each prediction is stored as its string representation.\n",
    "df = pd.DataFrame(\n",
    "    [\n",
    "        (position, str(prediction), outcome)\n",
    "        for position, (_example, prediction, outcome) in enumerate(results)\n",
    "    ],\n",
    "    columns=['Example_Index', 'Prediction', 'Score'],\n",
    ")\n",
    "print(\"\\nResults DataFrame:\")\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Side-by-side view of each question (truncated to 100 characters), the predicted\n",
    "# answer and the reference answer: one row per (example, prediction, score) triple.\n",
    "comparison_df = pd.DataFrame([{\n",
    "        'Question': question[:100] + '...' if len(question) > 100 else question,\n",
    "        # Fall back to the whole prediction's string form when it has no `answer` attribute.\n",
    "        'Predicted Answer': prediction.answer if hasattr(prediction, 'answer') else str(prediction),\n",
    "        'Correct Answer': example.answer,\n",
    "        'Is Correct': '✔️' if score else '❌'\n",
    "    }\n",
    "    for example, prediction, score in results\n",
    "    # Single-item loop binds the question text once per row instead of re-reading it three times.\n",
    "    for question in [example.inputs()['question']]\n",
    "])\n",
    "\n",
    "csv_filename = 'prediction_results.csv'\n",
    "# utf-8-sig writes a UTF-8 byte-order mark, which helps spreadsheet tools detect the encoding.\n",
    "comparison_df.to_csv(csv_filename, index=False, encoding='utf-8-sig')\n",
    "print(f\"\\nResults saved to {csv_filename}\")\n",
    "\n",
    "# Lift the column-width limit for this printout only, so the display settings of\n",
    "# later cells are not silently changed by running this one.\n",
    "print(\"\\nPredictions vs Actual Answers:\")\n",
    "with pd.option_context('display.max_colwidth', None):\n",
    "    print(comparison_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# Evaluate the program over the entire test set; the return value is not assigned.\n",
    "print(\"Starting execution...\")\n",
    "evaluate(program, devset=testset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optimize Subset + Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "# Lower the 'dspy' logger threshold to DEBUG so records at DEBUG and above pass its level check.\n",
    "dspy_logger = logging.getLogger('dspy')\n",
    "dspy_logger.setLevel(logging.DEBUG)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 01:17:12 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:\n",
      "num_trials: 7\n",
      "minibatch: True\n",
      "num_candidates: 5\n",
      "valset size: 100\n",
      "\n",
      "2025/01/30 01:17:12 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n",
      "2025/01/30 01:17:12 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used for informing instruction proposal.\n",
      "\n",
      "2025/01/30 01:17:12 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=5 sets of demonstrations...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapping set 1/5\n",
      "Bootstrapping set 2/5\n",
      "Bootstrapping set 3/5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|                                                                                                                                                                                                                                           | 0/200 [00:00<?, ?it/s]\n",
      "2025/01/30 01:17:12 INFO dspy.teleprompt.mipro_optimizer_v2: Error generating few-shot examples: empty range for randrange() (1, 1, 0)\n",
      "2025/01/30 01:17:12 INFO dspy.teleprompt.mipro_optimizer_v2: Running without few-shot examples.\n",
      "2025/01/30 01:17:12 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==\n",
      "2025/01/30 01:17:12 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 0 full traces after 0 examples for up to 1 rounds, amounting to 0 attempts.\n",
      "Bootstrapping set 4/5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 01:17:28 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "Proposing instructions...\n",
      "\n",
      "2025/01/30 01:17:46 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n",
      "\n",
      "2025/01/30 01:17:46 INFO dspy.teleprompt.mipro_optimizer_v2: 0: You are a knowledgeable and insightful assistant designed to provide guidance on multiple-choice questions. Your role involves analyzing questions, assessing answer choices, and offering well-reasoned explanations to enhance understanding. By breaking down complex concepts, you help users develop critical thinking skills and improve their decision-making process. You strive to present information in a clear, structured manner while adapting to the user's level of expertise. Ultimately, your goal is to foster deeper comprehension and confidence in tackling multiple-choice assessments.\n",
      "\n",
      "2025/01/30 01:17:46 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Analyze the given multiple-choice question and options, then provide a step-by-step reasoning process to determine the correct answer. Ensure the explanation is clear, concise, and tailored to the user's level of expertise, enhancing their understanding and critical thinking skills.\n",
      "\n",
      "2025/01/30 01:17:46 INFO dspy.teleprompt.mipro_optimizer_v2: 2: You are a seasoned educator with expertise in multiple subjects, including law, mathematics, history, and sociology. Your role is to guide students through complex multiple-choice questions, providing detailed explanations and step-by-step reasoning to help them understand the underlying concepts. When presented with a question and a set of answer choices, carefully analyze the options, identify the most relevant information, and apply critical thinking skills to arrive at the correct answer. Your explanations should be clear, concise, and tailored to the student's level of understanding, with the goal of enhancing their critical thinking skills, fostering deeper comprehension, and building confidence in tackling multiple-choice assessments.\n",
      "\n",
      "2025/01/30 01:17:46 INFO dspy.teleprompt.mipro_optimizer_v2: 3: To tackle this multiple-choice question effectively, let's break it down into manageable parts. First, carefully read the question to understand what is being asked, identifying any key terms or concepts that are crucial to the inquiry. Next, examine each answer choice, evaluating the relevance and accuracy of each option in relation to the question. Consider the context and any underlying principles or rules that might apply. Then, systematically analyze each choice against the question, eliminating options that are clearly incorrect or less relevant. As you narrow down the possibilities, think critically about the remaining choices, applying logical reasoning and any applicable knowledge from the subject area to determine the most appropriate answer. Finally, select the answer that best aligns with your analysis and provide a clear, step-by-step explanation of your reasoning process, ensuring that your thought process is transparent and easy to follow. This approach will not only help in selecting the correct answer but also in understanding the underlying concepts and principles, thereby enhancing your critical thinking skills and confidence in tackling similar questions in the future.\n",
      "\n",
      "2025/01/30 01:17:46 INFO dspy.teleprompt.mipro_optimizer_v2: 4: To tackle this multiple-choice question effectively, I will carefully analyze the given question, assess each answer choice, and provide a well-structured explanation to justify the correct answer. My approach involves breaking down complex concepts into manageable parts, evaluating the relevance and accuracy of each option, and selecting the most appropriate answer based on logical reasoning and contextual understanding. I will present my thought process in a clear and step-by-step manner, ensuring that the explanation is easy to follow and understand, regardless of the user's level of expertise. By doing so, I aim to not only provide the correct answer but also to enhance the user's critical thinking skills and confidence in approaching similar questions in the future.\n",
      "\n",
      "2025/01/30 01:17:46 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "\n",
      "2025/01/30 01:17:46 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the default program...\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 88.00 / 100 (88.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:46<00:00,  2.16it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 01:18:32 INFO dspy.evaluate.evaluate: Average Metric: 88 / 100 (88.0%)\n",
      "2025/01/30 01:18:32 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 88.0\n",
      "\n",
      "2025/01/30 01:18:32 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n",
      "2025/01/30 01:18:32 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n",
      "\n",
      "2025/01/30 01:18:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 1 / 7 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 24.00 / 25 (96.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00,  1.89it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 01:18:46 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)\n",
      "2025/01/30 01:18:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1'].\n",
      "2025/01/30 01:18:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0]\n",
      "2025/01/30 01:18:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [88.0]\n",
      "2025/01/30 01:18:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 88.0\n",
      "2025/01/30 01:18:46 INFO dspy.teleprompt.mipro_optimizer_v2: ===========================\n",
      "\n",
      "\n",
      "2025/01/30 01:18:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 2 / 7 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 23.00 / 25 (92.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:11<00:00,  2.12it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 01:18:58 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n",
      "2025/01/30 01:18:58 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1'].\n",
      "2025/01/30 01:18:58 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 92.0]\n",
      "2025/01/30 01:18:58 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [88.0]\n",
      "2025/01/30 01:18:58 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 88.0\n",
      "2025/01/30 01:18:58 INFO dspy.teleprompt.mipro_optimizer_v2: ===========================\n",
      "\n",
      "\n",
      "2025/01/30 01:18:58 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 3 / 7 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 23.00 / 25 (92.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00,  1.75it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 01:19:12 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n",
      "2025/01/30 01:19:12 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2'].\n",
      "2025/01/30 01:19:12 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 92.0, 92.0]\n",
      "2025/01/30 01:19:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [88.0]\n",
      "2025/01/30 01:19:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 88.0\n",
      "2025/01/30 01:19:12 INFO dspy.teleprompt.mipro_optimizer_v2: ===========================\n",
      "\n",
      "\n",
      "2025/01/30 01:19:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 4 / 7 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 22.00 / 25 (88.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00,  1.69it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 01:19:27 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n",
      "2025/01/30 01:19:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1'].\n",
      "2025/01/30 01:19:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 92.0, 92.0, 88.0]\n",
      "2025/01/30 01:19:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [88.0]\n",
      "2025/01/30 01:19:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 88.0\n",
      "2025/01/30 01:19:27 INFO dspy.teleprompt.mipro_optimizer_v2: ===========================\n",
      "\n",
      "\n",
      "2025/01/30 01:19:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 5 / 7 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 22.00 / 25 (88.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:16<00:00,  1.49it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 01:19:44 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n",
      "2025/01/30 01:19:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 4'].\n",
      "2025/01/30 01:19:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 92.0, 92.0, 88.0, 88.0]\n",
      "2025/01/30 01:19:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [88.0]\n",
      "2025/01/30 01:19:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 88.0\n",
      "2025/01/30 01:19:44 INFO dspy.teleprompt.mipro_optimizer_v2: ===========================\n",
      "\n",
      "\n",
      "2025/01/30 01:19:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 6 / 7 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 22.00 / 25 (88.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:18<00:00,  1.35it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 01:20:02 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n",
      "2025/01/30 01:20:02 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1'].\n",
      "2025/01/30 01:20:02 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 92.0, 92.0, 88.0, 88.0, 88.0]\n",
      "2025/01/30 01:20:02 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [88.0]\n",
      "2025/01/30 01:20:02 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 88.0\n",
      "2025/01/30 01:20:02 INFO dspy.teleprompt.mipro_optimizer_v2: ===========================\n",
      "\n",
      "\n",
      "2025/01/30 01:20:02 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 7 / 7 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 23.00 / 25 (92.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:19<00:00,  1.30it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 01:20:22 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n",
      "2025/01/30 01:20:22 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2'].\n",
      "2025/01/30 01:20:22 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [96.0, 92.0, 92.0, 88.0, 88.0, 88.0, 92.0]\n",
      "2025/01/30 01:20:22 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [88.0]\n",
      "2025/01/30 01:20:22 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 88.0\n",
      "2025/01/30 01:20:22 INFO dspy.teleprompt.mipro_optimizer_v2: ===========================\n",
      "\n",
      "\n",
      "2025/01/30 01:20:22 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 1 =====\n",
      "2025/01/30 01:20:22 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 92.0) from minibatch trials...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 89.00 / 99 (89.9%):  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 99/100 [00:21<00:01,  1.62s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 01:21:11 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A 30 mF capacitor has 6 millicoulombs of charge on each plate. The energy stored in the capacitor is most nearly', 'options': {'A': '5.4 × 10–10 J', 'B': '9.0 × 10–8 J', 'C': '0.6 J', 'D': '12.5 J'}, 'answer': 'C'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 89.00 / 99 (89.9%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:48<00:00,  2.05it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/30 01:21:11 INFO dspy.evaluate.evaluate: Average Metric: 89.0 / 100 (89.0%)\n",
      "2025/01/30 01:21:11 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mNew best full eval score!\u001b[0m Score: 89.0\n",
      "2025/01/30 01:21:11 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [88.0, 89.0]\n",
      "2025/01/30 01:21:11 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 89.0\n",
      "2025/01/30 01:21:11 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "2025/01/30 01:21:11 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "\n",
      "2025/01/30 01:21:11 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 89.0!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "CPU times: user 4.02 s, sys: 932 ms, total: 4.95 s\n",
      "Wall time: 3min 59s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "subset_size = 200\n",
    "optimizer = dspy.MIPROv2(\n",
    "    metric=benchmark.metric,\n",
    "    auto=\"light\",\n",
    "    num_threads=NUM_THREADS,\n",
    "    task_model=TASK_MODEL,\n",
    "    prompt_model=PROMPT_MODEL,\n",
    "    max_labeled_demos=FEW_SHOTS,\n",
    "    max_bootstrapped_demos=0,\n",
    ")\n",
    "\n",
    "optimized_program = optimizer.compile(\n",
    "    program,\n",
    "    trainset=trainset[:subset_size],\n",
    "    valset=valset[:subset_size],\n",
    "    requires_permission_to_run=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BEST PROMPT:\n",
      " You are a knowledgeable and insightful assistant designed to provide guidance on multiple-choice questions. Your role involves analyzing questions, assessing answer choices, and offering well-reasoned explanations to enhance understanding. By breaking down complex concepts, you help users develop critical thinking skills and improve their decision-making process. You strive to present information in a clear, structured manner while adapting to the user's level of expertise. Ultimately, your goal is to foster deeper comprehension and confidence in tackling multiple-choice assessments.\n",
      "CPU times: user 254 μs, sys: 22 μs, total: 276 μs\n",
      "Wall time: 238 μs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "print(\"BEST PROMPT:\\n\", optimized_program.signature.instructions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for module_name, module in optimized_program.__dict__.items():\n",
    "    if hasattr(module, 'signature'):\n",
    "        print(f\"\\nModule {module_name} instructions:\")\n",
    "        print(module.signature.instructions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BEST EXAMPLES:\n",
      " []\n"
     ]
    }
   ],
   "source": [
    "print(\"BEST EXAMPLES:\\n\", optimized_program.demos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BEST EXAMPLES:\n",
      " [\n",
      "  {\n",
      "    \"question\": \"Dr. Ryan is a psychotherapist in a small town. She receives a phone call from a man she was sexually involved with several years ago. The relationship lasted only three months and ended amicably. The man is now married and is having problems unrelated to their past relationship. He has called to see if he can begin seeing Dr. Ryan in therapy. Dr. Ryan should:\",\n",
      "    \"options\": {\n",
      "      \"A\": \"see the man but disclose their past relationship to a supervisor or colleague for transparency.\",\n",
      "      \"B\": \"deny the man's request without providing a reason.\",\n",
      "      \"C\": \"see the man only if their relationship ended more than two years ago and she determines that it will not interfere with her objectivity.\",\n",
      "      \"D\": \"see the man only if she feels she can maintain professional boundaries despite their past.\",\n",
      "      \"E\": \"refer the man to a colleague.\",\n",
      "      \"F\": \"continue therapy with the man but avoid discussing their past relationship.\",\n",
      "      \"G\": \"see the man only if she determines that their past relationship will not interfere with her objectivity.\",\n",
      "      \"H\": \"provide therapy for the man over the phone or through online sessions to maintain physical boundaries.\",\n",
      "      \"I\": \"see the man only if his wife is aware of their past relationship and consents to the therapy.\",\n",
      "      \"J\": \"see the man but discuss the potential for con\\ufb02icts with him before beginning treatment.\"\n",
      "    },\n",
      "    \"answer\": \"E\",\n",
      "    \"reasoning\": \"The American Psychological Association's Ethics Code states that psychologists should refrain from entering into a therapeutic relationship with individuals with whom they have a personal relationship that could impair their professional judgment or create a risk of exploitation. In this case, Dr. Ryan's past sexual relationship with the man could potentially create a dual relationship, which may impair her objectivity and professional judgment. While the relationship ended amicably and was several years ago, it is still important for Dr. Ryan to consider whether her past involvement with the man could influence her ability to provide unbiased and effective therapy. The most appropriate course of action would be for Dr. Ryan to refer the man to a colleague, as this would avoid any potential conflict of interest or dual relationship. This approach prioritizes the man's need for therapy while also maintaining the ethical standards of the profession.\",\n",
      "    \"input_keys\": null\n",
      "  },\n",
      "  {\n",
      "    \"question\": \"A writer sent a four-page synopsis of an idea for a new television series to a Hollywood producer. He sent it in response to an ad for new ideas in an industry publication. He discussed it with the producer's assistant in a phone call, and mentioned his expectation of compensation. She said, \\\"Well, of course, we always pay for a writer's work.\\\" She said she would go over it with her boss to see if he liked it. Several months later, the writer saw a casting call for a new series. The plot and characters were nearly identical to those described in his synopsis. He sued the producer for breach of contract. The producer defended by arguing that there was no contract. What is the likely ruling of the court?\",\n",
      "    \"options\": {\n",
      "      \"A\": \"The court will rule that there can be no contract as the writer didn't explicitly state his expectation for compensation for the idea itself.\",\n",
      "      \"B\": \"The court will rule that there was an implied in fact contract between the parties based on the conduct that they manifested.\",\n",
      "      \"C\": \"The court will rule that there was no consideration specifically mentioned and under those circumstances, the law viewed the synopsis as a gift to the producer.\",\n",
      "      \"D\": \"The court will rule that there is an express contract because there was a meeting of the mind and mutual assent to the basic terms.\",\n",
      "      \"E\": \"The court will rule that there was a unilateral contract, with the producer obligated to pay only if he used the idea.\",\n",
      "      \"F\": \"The court will rule that the writer's case is invalid as there was no written agreement.\",\n",
      "      \"G\": \"The court will rule that the producer's assistant lacked the authority to form a contract, and thus, no contract exists.\",\n",
      "      \"H\": \"There were too many terms left out for this to be a contract, and it was too indefinite for the court to imply what the terms might have been.\",\n",
      "      \"I\": \"The court will rule that the phone call between the writer and the producer's assistant formed a verbal contract.\",\n",
      "      \"J\": null\n",
      "    },\n",
      "    \"answer\": \"B\",\n",
      "    \"reasoning\": \"\",\n",
      "    \"input_keys\": null\n",
      "  },\n",
      "  {\n",
      "    \"question\": \"A test charge q C, moving with a velocityv= (i_x +i_y) m/sec, experiences no force in a region of electric and magnetic fields. If the magnetic flux density B= (i_x - 2i_z)Wb/m^2, findE.\",\n",
      "    \"options\": {\n",
      "      \"A\": \"(i_z - 2i_y) volts/m\",\n",
      "      \"B\": \"(-i_x + i_y) volts/m\",\n",
      "      \"C\": \"(3i_x - i_y - 2i_z) volts/m\",\n",
      "      \"D\": \"(2ix - 2i_y +i_z) volts/m\",\n",
      "      \"E\": \"(i_x - 2i_z) volts/m\",\n",
      "      \"F\": \"(2i_x - i_y) volts/m\",\n",
      "      \"G\": \"(i_x +i_y) volts/m\",\n",
      "      \"H\": \"(2ix + 2i_y -i_z) volts/m\",\n",
      "      \"I\": \"(i_y + i_z) volts/m\",\n",
      "      \"J\": \"(i_x + 2i_y + i_z) volts/m\"\n",
      "    },\n",
      "    \"answer\": \"D\",\n",
      "    \"reasoning\": \"\",\n",
      "    \"input_keys\": null\n",
      "  }\n",
      "]\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "def example_to_dict(example):\n",
    "    return {\n",
    "        'question': example.question,\n",
    "        'options': example.options,\n",
    "        'answer': example.answer,\n",
    "        'reasoning': example.reasoning,\n",
    "        'input_keys': list(example.input_keys) if hasattr(example, 'input_keys') else None\n",
    "    }\n",
    "\n",
    "examples_json = [example_to_dict(example) for example in optimized_program.demos]\n",
    "print(\"BEST EXAMPLES:\\n\", json.dumps(examples_json, indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 37.00 / 40 (92.5%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 2826.92it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:03:16 INFO dspy.evaluate.evaluate: Average Metric: 37 / 40 (92.5%)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>options</th>\n",
       "      <th>example_answer</th>\n",
       "      <th>example_reasoning</th>\n",
       "      <th>pred_reasoning</th>\n",
       "      <th>pred_answer</th>\n",
       "      <th>metric</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>For which of these two scenarios does the main character (who uses...</td>\n",
       "      <td>{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...</td>\n",
       "      <td>A</td>\n",
       "      <td>## Step 1: Evaluate the first scenario In the first scenario, the ...</td>\n",
       "      <td>To determine which scenario involves the main character doing some...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Two lists, list1 and list2, contain the names of books found in tw...</td>\n",
       "      <td>{'A': 'newList ← Combine (listl, list2)\\n newList ← Sort (newList)...</td>\n",
       "      <td>A</td>\n",
       "      <td>To create newList, which contains the names of all books found in ...</td>\n",
       "      <td>To create newList, which contains the names of all books found in ...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>How might the recent global economic crisis be viewed as a challen...</td>\n",
       "      <td>{'A': 'Governmental cyberspace restrictions, in the form of censor...</td>\n",
       "      <td>C</td>\n",
       "      <td>## Step 1: Understanding the liberalist perspective The liberalist...</td>\n",
       "      <td>The liberalist perspective emphasizes the importance of free marke...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>What term is used to represent unavoidable past costs that cannot ...</td>\n",
       "      <td>{'A': 'Prime costs', 'B': 'Sunk costs', 'C': 'Opportunity costs', ...</td>\n",
       "      <td>B</td>\n",
       "      <td>## Step 1: Understanding the concept of sunk costs Sunk costs are ...</td>\n",
       "      <td>To answer this question, we need to understand the concept of each...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Markson Co. traded a concrete-mixing truck with a book value of $1...</td>\n",
       "      <td>{'A': 'Does the book value of the asset given up exceed the fair v...</td>\n",
       "      <td>C</td>\n",
       "      <td>## Step 1: Understand the concept of commercial substance in asset...</td>\n",
       "      <td>To determine whether an exchange of assets has commercial substanc...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Which of the following substances is found in greater quantity in ...</td>\n",
       "      <td>{'A': 'Carbon dioxide', 'B': 'Carbon monoxide', 'C': 'Nitrogen', '...</td>\n",
       "      <td>A</td>\n",
       "      <td>## Step 1: Understanding the context of the question The question ...</td>\n",
       "      <td>To determine which substance is found in greater quantity in exhal...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>For which of these two scenarios does the main character (who uses...</td>\n",
       "      <td>{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...</td>\n",
       "      <td>D</td>\n",
       "      <td>## Step 1: Analyzing Scenario 1 In Scenario 1, the main character ...</td>\n",
       "      <td>To determine which scenario involves the main character doing some...</td>\n",
       "      <td>D</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>For which of these two scenarios does the main character (who uses...</td>\n",
       "      <td>{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...</td>\n",
       "      <td>C</td>\n",
       "      <td>## Step 1: Evaluate the first scenario In the first scenario, the ...</td>\n",
       "      <td>To determine which scenario involves the main character doing some...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>In the inherited disorder called Pompe disease, glycogen breakdown...</td>\n",
       "      <td>{'A': 'enzymes in the lysosomes', 'B': 'enzymes in the mitochondri...</td>\n",
       "      <td>A</td>\n",
       "      <td>## Step 1: Understanding Pompe Disease Pompe disease is an inherit...</td>\n",
       "      <td>Pompe disease is characterized by the accumulation of glycogen in ...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>A state legislature has recently enacted an obscenity statute proh...</td>\n",
       "      <td>{'A': 'The particular materials involved depicted normal, not devi...</td>\n",
       "      <td>B</td>\n",
       "      <td>To determine which of the given options would be most helpful to t...</td>\n",
       "      <td>To determine which of the given options would be most helpful to t...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>A 29-year-old woman comes to the emergency department because she ...</td>\n",
       "      <td>{'A': 'Cervical culture', 'B': 'Culdocentesis', 'C': 'Laparoscopy'...</td>\n",
       "      <td>C</td>\n",
       "      <td>## Step 1: Consider the patient's symptoms The patient presents wi...</td>\n",
       "      <td>The patient presents with symptoms of lower abdominal pain, nausea...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>A person who experiences flashbacks and nightmares after being inv...</td>\n",
       "      <td>{'A': 'dissociative identity disorder', 'B': 'bipolar disorder', '...</td>\n",
       "      <td>D</td>\n",
       "      <td>## Step 1: Identify the key symptoms described in the question. Th...</td>\n",
       "      <td>To answer this question, we need to consider the symptoms describe...</td>\n",
       "      <td>D</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>A scientist studied the migration patterns of two types of whales....</td>\n",
       "      <td>{'A': '128', 'B': '192', 'C': '280', 'D': '408'}</td>\n",
       "      <td>B</td>\n",
       "      <td>## Step 1: Calculate the rate of the humpback whales To find the r...</td>\n",
       "      <td>To solve this problem, we first need to find the rate at which the...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Select the best translation into predicate logic: Not all lobster...</td>\n",
       "      <td>{'A': '~(∀x)(Lx ⊃ Rx)', 'B': '~(∃x)(Lx ⊃ Rx)', 'C': '~(∃x)(Lx • Rx...</td>\n",
       "      <td>A</td>\n",
       "      <td>## Step 1: Understanding the statement The statement \"Not all lobs...</td>\n",
       "      <td>To translate the statement \"Not all lobsters are red\" into predica...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>Which of the following appears to lower bad cholesterol?</td>\n",
       "      <td>{'A': 'Vitamin D', 'B': 'Niacin', 'C': 'Thiamine', 'D': 'Riboflavin'}</td>\n",
       "      <td>B</td>\n",
       "      <td>## Step 1: Understanding the question The question asks which of t...</td>\n",
       "      <td>To answer this question, we need to consider the effects of each o...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>One Friday afternoon, just before leaving on a weekend tip, a psyc...</td>\n",
       "      <td>{'A': \"unethical, because the client's needs were not addressed mo...</td>\n",
       "      <td>A</td>\n",
       "      <td>## Step 1: Identify the key elements of the situation The psycholo...</td>\n",
       "      <td>The psychologist's behavior in this situation is unethical because...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>For which of these two scenarios does the main character (who uses...</td>\n",
       "      <td>{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...</td>\n",
       "      <td>A</td>\n",
       "      <td>## Step 1: Analyzing Scenario 1 In Scenario 1, the main character ...</td>\n",
       "      <td>To determine which scenario involves the main character doing some...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>When was the major shift by Greek philosophers who rejected the a...</td>\n",
       "      <td>{'A': 'Early Third Century BCE', 'B': 'Second and First Century BC...</td>\n",
       "      <td>D</td>\n",
       "      <td>## Step 1: Identify the time period of the major shift in Greek ph...</td>\n",
       "      <td>The major shift by Greek philosophers who rejected the anthropomor...</td>\n",
       "      <td>C</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>What does Marquis think is the main reason why it is wrong to kil...</td>\n",
       "      <td>{'A': 'The fetus can feel pain.', 'B': 'Abortion will harm the wom...</td>\n",
       "      <td>C</td>\n",
       "      <td>## Step 1: Understand the question The question asks for Marquis' ...</td>\n",
       "      <td>To answer this question, we need to consider the philosophical arg...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>For which of these two scenarios does the main character (who uses...</td>\n",
       "      <td>{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...</td>\n",
       "      <td>B</td>\n",
       "      <td>## Step 1: Evaluate Scenario 1 In Scenario 1, the main character p...</td>\n",
       "      <td>To determine which scenario involves the main character doing some...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>The _post hoc_ fallacy is</td>\n",
       "      <td>{'A': 'arguing that a single event caused another when the cause i...</td>\n",
       "      <td>D</td>\n",
       "      <td>## Step 1: Understanding the _post hoc_ fallacy The _post hoc_ fal...</td>\n",
       "      <td>The _post hoc_ fallacy, which is Latin for \"after this, therefore ...</td>\n",
       "      <td>D</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>Statement 1| We learn a classifier f by boosting weak learners h. ...</td>\n",
       "      <td>{'A': 'True, True', 'B': 'False, False', 'C': 'True, False', 'D': ...</td>\n",
       "      <td>D</td>\n",
       "      <td>## Step 1: Analyzing Statement 1 Statement 1 claims that the funct...</td>\n",
       "      <td>To address this question, let's analyze each statement individuall...</td>\n",
       "      <td>D</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>Which of the following is not a recognised type of plan?</td>\n",
       "      <td>{'A': 'Business', 'B': 'Succession', 'C': 'Ad hoc', 'D': 'Financial'}</td>\n",
       "      <td>C</td>\n",
       "      <td>## Step 1: Identify the types of plans There are several types of ...</td>\n",
       "      <td>To determine which of the following is not a recognised type of pl...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>Why is market segmentation carried out?</td>\n",
       "      <td>{'A': 'To break down large markets into smaller markets.', 'B': 'T...</td>\n",
       "      <td>C</td>\n",
       "      <td>To answer this question, let's consider the purpose and benefits o...</td>\n",
       "      <td>Market segmentation is a crucial strategy in marketing that involv...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>Although corporate profit maximization is not directly prescribed ...</td>\n",
       "      <td>{'A': 'hedonistic utilitarianism', 'B': 'preference utilitarianism...</td>\n",
       "      <td>B</td>\n",
       "      <td>To determine which normative theory corporate profit maximization ...</td>\n",
       "      <td>To determine which normative theory corporate profit maximization ...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>A state has passed a law that provides that only residents of the ...</td>\n",
       "      <td>{'A': 'The contract clause prohibition against a state from enacti...</td>\n",
       "      <td>C</td>\n",
       "      <td>## Step 1: Understanding the issue The state law in question restr...</td>\n",
       "      <td>The best constitutional argument to contest the validity of the st...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>A company incorporated and headquartered in State A sued a plumber...</td>\n",
       "      <td>{'A': \"No, because the court could fix the amount of damages even ...</td>\n",
       "      <td>B</td>\n",
       "      <td>## Step 1: Understanding the procedural context The plumber filed ...</td>\n",
       "      <td>To determine whether the court is likely to grant the plumber's mo...</td>\n",
       "      <td>C</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>Select the best translation into predicate logic. Some animals are...</td>\n",
       "      <td>{'A': '(∃x)[(Ax • Cx) • (∃y)(Py • Nyx)]', 'B': '(∃x)[(Cx • Px) • (...</td>\n",
       "      <td>B</td>\n",
       "      <td>To translate the given sentence \"Some animals are neglected by cru...</td>\n",
       "      <td>To translate the given statement \"Some animals are neglected by cr...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>African lungfish often live in small, stagnant, freshwater pools. ...</td>\n",
       "      <td>{'A': 'Urea is insoluble in water and sinks to the bottom of the s...</td>\n",
       "      <td>B</td>\n",
       "      <td>## Step 1: Understanding the context of the problem The problem de...</td>\n",
       "      <td>To solve this question, let's analyze each option given and relate...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>Which of the following is a function of biotin?\\n</td>\n",
       "      <td>{'A': 'Decarboxylation of amino acids to form amine neurotransmitt...</td>\n",
       "      <td>C</td>\n",
       "      <td>## Step 1: Understanding Biotin's Role Biotin is a B-vitamin that ...</td>\n",
       "      <td>To answer this question, we need to understand the role of biotin ...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>How many human polyomaviruses are known at present?</td>\n",
       "      <td>{'A': '100', 'B': '1', 'C': '10', 'D': 'unknown'}</td>\n",
       "      <td>C</td>\n",
       "      <td>## Step 1: Identify the question The question asks for the number ...</td>\n",
       "      <td>To answer this question, we need to consider the current state of ...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>A female employee was fired as an executive assistant at a time wh...</td>\n",
       "      <td>{'A': 'No, because the statement is clearly hearsay with no except...</td>\n",
       "      <td>C</td>\n",
       "      <td>## Step 1: Determine the nature of the statement in question. The ...</td>\n",
       "      <td>The statement made by the human relations manager, as recounted by...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>For which of these two scenarios does the main character (who uses...</td>\n",
       "      <td>{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...</td>\n",
       "      <td>D</td>\n",
       "      <td>## Step 1: Analyzing Scenario 1 In Scenario 1, the main character ...</td>\n",
       "      <td>To determine which scenario involves the main character doing some...</td>\n",
       "      <td>D</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>How old is the Earth approximately?</td>\n",
       "      <td>{'A': '50,000 years', 'B': '300 million years', 'C': '4.5 billion ...</td>\n",
       "      <td>C</td>\n",
       "      <td>## Step 1: Understanding the question The question asks for the ap...</td>\n",
       "      <td>To determine the approximate age of the Earth, we must consider th...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>Which of these is not a type of primate?</td>\n",
       "      <td>{'A': 'baboon', 'B': 'marmot', 'C': 'orangutan', 'D': 'chimpanzee'}</td>\n",
       "      <td>B</td>\n",
       "      <td>## Step 1: Identify the types of primates First, we need to know w...</td>\n",
       "      <td>To determine which of these is not a type of primate, we need to i...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>There are different forms of the ANOVA (FACTORIAL ANOVA, MANOVA, A...</td>\n",
       "      <td>{'A': 'when there are 2 or more IVs', 'B': 'when study has 2 or mo...</td>\n",
       "      <td>D</td>\n",
       "      <td>## Step 1: Understanding the different forms of ANOVA There are se...</td>\n",
       "      <td>To determine when the ANCOVA (not ACOVA) form is used, we need to ...</td>\n",
       "      <td>D</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>A ten-year-old boy, tested with a paper-and-pencil intelligence te...</td>\n",
       "      <td>{'A': 'more than 130', 'B': 'less than 130', 'C': '130', 'D': '130...</td>\n",
       "      <td>B</td>\n",
       "      <td>## Step 1: Understanding the Concept of IQ Scores IQ scores are de...</td>\n",
       "      <td>The question involves understanding how IQ scores work, especially...</td>\n",
       "      <td>C</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>In the context of globalization, how can we best define the term '...</td>\n",
       "      <td>{'A': 'The term development refers to the human condition in which...</td>\n",
       "      <td>C</td>\n",
       "      <td>To determine the best definition of 'development' in the context o...</td>\n",
       "      <td>To define the term 'development' in the context of globalization, ...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>If someone argues that something should be done in a certain way b...</td>\n",
       "      <td>{'A': 'slippery slope', 'B': 'appeal to tradition', 'C': 'argument...</td>\n",
       "      <td>B</td>\n",
       "      <td>## Step 1: Identify the type of fallacy described in the question....</td>\n",
       "      <td>The argument presented is based on the premise that a certain meth...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>This is part of the communication process where receivers unpack t...</td>\n",
       "      <td>{'A': 'Encoding.', 'B': 'Decoding.', 'C': 'Transfer.', 'D': 'Noise.'}</td>\n",
       "      <td>B</td>\n",
       "      <td>## Step 1: Understanding the communication process The communicati...</td>\n",
       "      <td>To answer this question, let's break down the communication proces...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                 question  \\\n",
       "0   For which of these two scenarios does the main character (who uses...   \n",
       "1   Two lists, list1 and list2, contain the names of books found in tw...   \n",
       "2   How might the recent global economic crisis be viewed as a challen...   \n",
       "3   What term is used to represent unavoidable past costs that cannot ...   \n",
       "4   Markson Co. traded a concrete-mixing truck with a book value of $1...   \n",
       "5   Which of the following substances is found in greater quantity in ...   \n",
       "6   For which of these two scenarios does the main character (who uses...   \n",
       "7   For which of these two scenarios does the main character (who uses...   \n",
       "8   In the inherited disorder called Pompe disease, glycogen breakdown...   \n",
       "9   A state legislature has recently enacted an obscenity statute proh...   \n",
       "10  A 29-year-old woman comes to the emergency department because she ...   \n",
       "11  A person who experiences flashbacks and nightmares after being inv...   \n",
       "12  A scientist studied the migration patterns of two types of whales....   \n",
       "13   Select the best translation into predicate logic: Not all lobster...   \n",
       "14               Which of the following appears to lower bad cholesterol?   \n",
       "15  One Friday afternoon, just before leaving on a weekend tip, a psyc...   \n",
       "16  For which of these two scenarios does the main character (who uses...   \n",
       "17   When was the major shift by Greek philosophers who rejected the a...   \n",
       "18   What does Marquis think is the main reason why it is wrong to kil...   \n",
       "19  For which of these two scenarios does the main character (who uses...   \n",
       "20                                              The _post hoc_ fallacy is   \n",
       "21  Statement 1| We learn a classifier f by boosting weak learners h. ...   \n",
       "22               Which of the following is not a recognised type of plan?   \n",
       "23                                Why is market segmentation carried out?   \n",
       "24  Although corporate profit maximization is not directly prescribed ...   \n",
       "25  A state has passed a law that provides that only residents of the ...   \n",
       "26  A company incorporated and headquartered in State A sued a plumber...   \n",
       "27  Select the best translation into predicate logic. Some animals are...   \n",
       "28  African lungfish often live in small, stagnant, freshwater pools. ...   \n",
       "29                      Which of the following is a function of biotin?\\n   \n",
       "30                    How many human polyomaviruses are known at present?   \n",
       "31  A female employee was fired as an executive assistant at a time wh...   \n",
       "32  For which of these two scenarios does the main character (who uses...   \n",
       "33                                    How old is the Earth approximately?   \n",
       "34                               Which of these is not a type of primate?   \n",
       "35  There are different forms of the ANOVA (FACTORIAL ANOVA, MANOVA, A...   \n",
       "36  A ten-year-old boy, tested with a paper-and-pencil intelligence te...   \n",
       "37  In the context of globalization, how can we best define the term '...   \n",
       "38  If someone argues that something should be done in a certain way b...   \n",
       "39  This is part of the communication process where receivers unpack t...   \n",
       "\n",
       "                                                                  options  \\\n",
       "0   {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...   \n",
       "1   {'A': 'newList ← Combine (listl, list2)\\n newList ← Sort (newList)...   \n",
       "2   {'A': 'Governmental cyberspace restrictions, in the form of censor...   \n",
       "3   {'A': 'Prime costs', 'B': 'Sunk costs', 'C': 'Opportunity costs', ...   \n",
       "4   {'A': 'Does the book value of the asset given up exceed the fair v...   \n",
       "5   {'A': 'Carbon dioxide', 'B': 'Carbon monoxide', 'C': 'Nitrogen', '...   \n",
       "6   {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...   \n",
       "7   {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...   \n",
       "8   {'A': 'enzymes in the lysosomes', 'B': 'enzymes in the mitochondri...   \n",
       "9   {'A': 'The particular materials involved depicted normal, not devi...   \n",
       "10  {'A': 'Cervical culture', 'B': 'Culdocentesis', 'C': 'Laparoscopy'...   \n",
       "11  {'A': 'dissociative identity disorder', 'B': 'bipolar disorder', '...   \n",
       "12                       {'A': '128', 'B': '192', 'C': '280', 'D': '408'}   \n",
       "13  {'A': '~(∀x)(Lx ⊃ Rx)', 'B': '~(∃x)(Lx ⊃ Rx)', 'C': '~(∃x)(Lx • Rx...   \n",
       "14  {'A': 'Vitamin D', 'B': 'Niacin', 'C': 'Thiamine', 'D': 'Riboflavin'}   \n",
       "15  {'A': \"unethical, because the client's needs were not addressed mo...   \n",
       "16  {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...   \n",
       "17  {'A': 'Early Third Century BCE', 'B': 'Second and First Century BC...   \n",
       "18  {'A': 'The fetus can feel pain.', 'B': 'Abortion will harm the wom...   \n",
       "19  {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...   \n",
       "20  {'A': 'arguing that a single event caused another when the cause i...   \n",
       "21  {'A': 'True, True', 'B': 'False, False', 'C': 'True, False', 'D': ...   \n",
       "22  {'A': 'Business', 'B': 'Succession', 'C': 'Ad hoc', 'D': 'Financial'}   \n",
       "23  {'A': 'To break down large markets into smaller markets.', 'B': 'T...   \n",
       "24  {'A': 'hedonistic utilitarianism', 'B': 'preference utilitarianism...   \n",
       "25  {'A': 'The contract clause prohibition against a state from enacti...   \n",
       "26  {'A': \"No, because the court could fix the amount of damages even ...   \n",
       "27  {'A': '(∃x)[(Ax • Cx) • (∃y)(Py • Nyx)]', 'B': '(∃x)[(Cx • Px) • (...   \n",
       "28  {'A': 'Urea is insoluble in water and sinks to the bottom of the s...   \n",
       "29  {'A': 'Decarboxylation of amino acids to form amine neurotransmitt...   \n",
       "30                      {'A': '100', 'B': '1', 'C': '10', 'D': 'unknown'}   \n",
       "31  {'A': 'No, because the statement is clearly hearsay with no except...   \n",
       "32  {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...   \n",
       "33  {'A': '50,000 years', 'B': '300 million years', 'C': '4.5 billion ...   \n",
       "34    {'A': 'baboon', 'B': 'marmot', 'C': 'orangutan', 'D': 'chimpanzee'}   \n",
       "35  {'A': 'when there are 2 or more IVs', 'B': 'when study has 2 or mo...   \n",
       "36  {'A': 'more than 130', 'B': 'less than 130', 'C': '130', 'D': '130...   \n",
       "37  {'A': 'The term development refers to the human condition in which...   \n",
       "38  {'A': 'slippery slope', 'B': 'appeal to tradition', 'C': 'argument...   \n",
       "39  {'A': 'Encoding.', 'B': 'Decoding.', 'C': 'Transfer.', 'D': 'Noise.'}   \n",
       "\n",
       "   example_answer  \\\n",
       "0               A   \n",
       "1               A   \n",
       "2               C   \n",
       "3               B   \n",
       "4               C   \n",
       "5               A   \n",
       "6               D   \n",
       "7               C   \n",
       "8               A   \n",
       "9               B   \n",
       "10              C   \n",
       "11              D   \n",
       "12              B   \n",
       "13              A   \n",
       "14              B   \n",
       "15              A   \n",
       "16              A   \n",
       "17              D   \n",
       "18              C   \n",
       "19              B   \n",
       "20              D   \n",
       "21              D   \n",
       "22              C   \n",
       "23              C   \n",
       "24              B   \n",
       "25              C   \n",
       "26              B   \n",
       "27              B   \n",
       "28              B   \n",
       "29              C   \n",
       "30              C   \n",
       "31              C   \n",
       "32              D   \n",
       "33              C   \n",
       "34              B   \n",
       "35              D   \n",
       "36              B   \n",
       "37              C   \n",
       "38              B   \n",
       "39              B   \n",
       "\n",
       "                                                        example_reasoning  \\\n",
       "0   ## Step 1: Evaluate the first scenario In the first scenario, the ...   \n",
       "1   To create newList, which contains the names of all books found in ...   \n",
       "2   ## Step 1: Understanding the liberalist perspective The liberalist...   \n",
       "3   ## Step 1: Understanding the concept of sunk costs Sunk costs are ...   \n",
       "4   ## Step 1: Understand the concept of commercial substance in asset...   \n",
       "5   ## Step 1: Understanding the context of the question The question ...   \n",
       "6   ## Step 1: Analyzing Scenario 1 In Scenario 1, the main character ...   \n",
       "7   ## Step 1: Evaluate the first scenario In the first scenario, the ...   \n",
       "8   ## Step 1: Understanding Pompe Disease Pompe disease is an inherit...   \n",
       "9   To determine which of the given options would be most helpful to t...   \n",
       "10  ## Step 1: Consider the patient's symptoms The patient presents wi...   \n",
       "11  ## Step 1: Identify the key symptoms described in the question. Th...   \n",
       "12  ## Step 1: Calculate the rate of the humpback whales To find the r...   \n",
       "13  ## Step 1: Understanding the statement The statement \"Not all lobs...   \n",
       "14  ## Step 1: Understanding the question The question asks which of t...   \n",
       "15  ## Step 1: Identify the key elements of the situation The psycholo...   \n",
       "16  ## Step 1: Analyzing Scenario 1 In Scenario 1, the main character ...   \n",
       "17  ## Step 1: Identify the time period of the major shift in Greek ph...   \n",
       "18  ## Step 1: Understand the question The question asks for Marquis' ...   \n",
       "19  ## Step 1: Evaluate Scenario 1 In Scenario 1, the main character p...   \n",
       "20  ## Step 1: Understanding the _post hoc_ fallacy The _post hoc_ fal...   \n",
       "21  ## Step 1: Analyzing Statement 1 Statement 1 claims that the funct...   \n",
       "22  ## Step 1: Identify the types of plans There are several types of ...   \n",
       "23  To answer this question, let's consider the purpose and benefits o...   \n",
       "24  To determine which normative theory corporate profit maximization ...   \n",
       "25  ## Step 1: Understanding the issue The state law in question restr...   \n",
       "26  ## Step 1: Understanding the procedural context The plumber filed ...   \n",
       "27  To translate the given sentence \"Some animals are neglected by cru...   \n",
       "28  ## Step 1: Understanding the context of the problem The problem de...   \n",
       "29  ## Step 1: Understanding Biotin's Role Biotin is a B-vitamin that ...   \n",
       "30  ## Step 1: Identify the question The question asks for the number ...   \n",
       "31  ## Step 1: Determine the nature of the statement in question. The ...   \n",
       "32  ## Step 1: Analyzing Scenario 1 In Scenario 1, the main character ...   \n",
       "33  ## Step 1: Understanding the question The question asks for the ap...   \n",
       "34  ## Step 1: Identify the types of primates First, we need to know w...   \n",
       "35  ## Step 1: Understanding the different forms of ANOVA There are se...   \n",
       "36  ## Step 1: Understanding the Concept of IQ Scores IQ scores are de...   \n",
       "37  To determine the best definition of 'development' in the context o...   \n",
       "38  ## Step 1: Identify the type of fallacy described in the question....   \n",
       "39  ## Step 1: Understanding the communication process The communicati...   \n",
       "\n",
       "                                                           pred_reasoning  \\\n",
       "0   To determine which scenario involves the main character doing some...   \n",
       "1   To create newList, which contains the names of all books found in ...   \n",
       "2   The liberalist perspective emphasizes the importance of free marke...   \n",
       "3   To answer this question, we need to understand the concept of each...   \n",
       "4   To determine whether an exchange of assets has commercial substanc...   \n",
       "5   To determine which substance is found in greater quantity in exhal...   \n",
       "6   To determine which scenario involves the main character doing some...   \n",
       "7   To determine which scenario involves the main character doing some...   \n",
       "8   Pompe disease is characterized by the accumulation of glycogen in ...   \n",
       "9   To determine which of the given options would be most helpful to t...   \n",
       "10  The patient presents with symptoms of lower abdominal pain, nausea...   \n",
       "11  To answer this question, we need to consider the symptoms describe...   \n",
       "12  To solve this problem, we first need to find the rate at which the...   \n",
       "13  To translate the statement \"Not all lobsters are red\" into predica...   \n",
       "14  To answer this question, we need to consider the effects of each o...   \n",
       "15  The psychologist's behavior in this situation is unethical because...   \n",
       "16  To determine which scenario involves the main character doing some...   \n",
       "17  The major shift by Greek philosophers who rejected the anthropomor...   \n",
       "18  To answer this question, we need to consider the philosophical arg...   \n",
       "19  To determine which scenario involves the main character doing some...   \n",
       "20  The _post hoc_ fallacy, which is Latin for \"after this, therefore ...   \n",
       "21  To address this question, let's analyze each statement individuall...   \n",
       "22  To determine which of the following is not a recognised type of pl...   \n",
       "23  Market segmentation is a crucial strategy in marketing that involv...   \n",
       "24  To determine which normative theory corporate profit maximization ...   \n",
       "25  The best constitutional argument to contest the validity of the st...   \n",
       "26  To determine whether the court is likely to grant the plumber's mo...   \n",
       "27  To translate the given statement \"Some animals are neglected by cr...   \n",
       "28  To solve this question, let's analyze each option given and relate...   \n",
       "29  To answer this question, we need to understand the role of biotin ...   \n",
       "30  To answer this question, we need to consider the current state of ...   \n",
       "31  The statement made by the human relations manager, as recounted by...   \n",
       "32  To determine which scenario involves the main character doing some...   \n",
       "33  To determine the approximate age of the Earth, we must consider th...   \n",
       "34  To determine which of these is not a type of primate, we need to i...   \n",
       "35  To determine when the ANCOVA (not ACOVA) form is used, we need to ...   \n",
       "36  The question involves understanding how IQ scores work, especially...   \n",
       "37  To define the term 'development' in the context of globalization, ...   \n",
       "38  The argument presented is based on the premise that a certain meth...   \n",
       "39  To answer this question, let's break down the communication proces...   \n",
       "\n",
       "   pred_answer     metric  \n",
       "0            A  ✔️ [True]  \n",
       "1            A  ✔️ [True]  \n",
       "2            C  ✔️ [True]  \n",
       "3            B  ✔️ [True]  \n",
       "4            C  ✔️ [True]  \n",
       "5            A  ✔️ [True]  \n",
       "6            D  ✔️ [True]  \n",
       "7            C  ✔️ [True]  \n",
       "8            A  ✔️ [True]  \n",
       "9            B  ✔️ [True]  \n",
       "10           C  ✔️ [True]  \n",
       "11           D  ✔️ [True]  \n",
       "12           B  ✔️ [True]  \n",
       "13           A  ✔️ [True]  \n",
       "14           B  ✔️ [True]  \n",
       "15           A  ✔️ [True]  \n",
       "16           A  ✔️ [True]  \n",
       "17           C             \n",
       "18           C  ✔️ [True]  \n",
       "19           B  ✔️ [True]  \n",
       "20           D  ✔️ [True]  \n",
       "21           D  ✔️ [True]  \n",
       "22           C  ✔️ [True]  \n",
       "23           C  ✔️ [True]  \n",
       "24           B  ✔️ [True]  \n",
       "25           C  ✔️ [True]  \n",
       "26           C             \n",
       "27           B  ✔️ [True]  \n",
       "28           B  ✔️ [True]  \n",
       "29           C  ✔️ [True]  \n",
       "30           C  ✔️ [True]  \n",
       "31           C  ✔️ [True]  \n",
       "32           D  ✔️ [True]  \n",
       "33           C  ✔️ [True]  \n",
       "34           B  ✔️ [True]  \n",
       "35           D  ✔️ [True]  \n",
       "36           C             \n",
       "37           C  ✔️ [True]  \n",
       "38           B  ✔️ [True]  \n",
       "39           B  ✔️ [True]  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 63.8 ms, sys: 8.1 ms, total: 71.9 ms\n",
      "Wall time: 66.7 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "score, results, all_scores = evaluate(\n",
    "    optimized_program,\n",
    "    devset=testset[:40],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Medium Optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:05:39 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:\n",
      "num_trials: 25\n",
      "minibatch: True\n",
      "num_candidates: 25\n",
      "valset size: 300\n",
      "\n",
      "2025/01/29 23:05:39 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n",
      "2025/01/29 23:05:39 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used for informing instruction proposal.\n",
      "\n",
      "2025/01/29 23:05:39 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=25 sets of demonstrations...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapping set 1/25\n",
      "Bootstrapping set 2/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▉                                                                                                                                                                                                                                                 | 2/500 [00:08<35:10,  4.24s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
      "Bootstrapping set 3/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▍                                                                                                                                                                                                                                              | 3/500 [00:00<00:00, 1029.78it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 4/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▉                                                                                                                                                                                                                                                 | 2/500 [00:09<40:22,  4.86s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
      "Bootstrapping set 5/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▍                                                                                                                                                                                                                                              | 3/500 [00:39<1:49:09, 13.18s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 6/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▍                                                                                                                                                                                                                                               | 1/500 [00:07<1:03:02,  7.58s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n",
      "Bootstrapping set 7/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▍                                                                                                                                                                                                                                                 | 1/500 [00:05<46:14,  5.56s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n",
      "Bootstrapping set 8/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▍                                                                                                                                                                                                                                                 | 1/500 [00:03<28:32,  3.43s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n",
      "Bootstrapping set 9/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▍                                                                                                                                                                                                                                                | 3/500 [00:20<56:43,  6.85s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 10/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▍                                                                                                                                                                                                                                              | 3/500 [00:25<1:10:33,  8.52s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 11/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▉                                                                                                                                                                                                                                                 | 2/500 [00:09<38:41,  4.66s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
      "Bootstrapping set 12/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▉                                                                                                                                                                                                                                                 | 2/500 [00:11<47:16,  5.70s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
      "Bootstrapping set 13/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▍                                                                                                                                                                                                                                               | 1/500 [00:08<1:14:39,  8.98s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n",
      "Bootstrapping set 14/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▉                                                                                                                                                                                                                                                 | 2/500 [00:09<39:02,  4.70s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
      "Bootstrapping set 15/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▉                                                                                                                                                                                                                                                 | 2/500 [00:11<46:20,  5.58s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
      "Bootstrapping set 16/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▍                                                                                                                                                                                                                                              | 3/500 [00:23<1:04:49,  7.82s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 17/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▍                                                                                                                                                                                                                                                 | 1/500 [00:06<56:44,  6.82s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n",
      "Bootstrapping set 18/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▍                                                                                                                                                                                                                                                | 3/500 [00:14<40:10,  4.85s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 19/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▍                                                                                                                                                                                                                                                | 3/500 [00:19<54:38,  6.60s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 20/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▉                                                                                                                                                                                                                                                 | 2/500 [00:10<42:03,  5.07s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
      "Bootstrapping set 21/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▉                                                                                                                                                                                                                                              | 4/500 [00:29<1:01:42,  7.46s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n",
      "Bootstrapping set 22/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|█▍                                                                                                                                                                                                                                              | 3/500 [00:22<1:02:47,  7.58s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 23/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▉                                                                                                                                                                                                                                                 | 2/500 [00:14<58:31,  7.05s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
      "Bootstrapping set 24/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▍                                                                                                                                                                                                                                                 | 1/500 [00:04<39:09,  4.71s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n",
      "Bootstrapping set 25/25\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▍                                                                                                                                                                                                                                               | 1/500 [00:10<1:31:02, 10.95s/it]\n",
      "2025/01/29 23:11:07 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==\n",
      "2025/01/29 23:11:07 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:11:19 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "Proposing instructions...\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 0: You are a knowledgeable and insightful assistant designed to provide guidance on multiple-choice questions. Your role involves analyzing questions, assessing answer choices, and offering well-reasoned explanations to enhance understanding. By breaking down complex concepts, you help users develop critical thinking skills and improve their decision-making process. You strive to present information in a clear, structured manner while adapting to the user's level of expertise. Ultimately, your goal is to foster deeper comprehension and confidence in tackling multiple-choice assessments.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 1: To tackle a multiple-choice question effectively, carefully read and analyze the question stem, identifying key concepts and any specific details that might narrow down the possible answers. Next, assess each answer choice in relation to the question, considering the relevance, accuracy, and implications of each option. Provide a step-by-step breakdown of your reasoning process, ensuring that each step logically follows from the previous one and that you address any complexities or nuances of the question. In your explanation, define key terms, explain relevant concepts, and justify why certain options can be eliminated or why one option stands out as the correct answer. Throughout your response, maintain clarity and organization, using transitional phrases and clear headings to guide the reader through your thought process. Finally, conclude by stating the correct answer and summarizing the main points that led to this conclusion, reinforcing the reasoning and ensuring that the explanation is accessible to readers of various expertise levels.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 2: You are a skilled educator and analyst with expertise in multiple disciplines, including law, economics, physics, and general knowledge. Your role is to guide users through complex multiple-choice questions by providing detailed, step-by-step reasoning and explanations. You aim to enhance users' critical thinking skills, decision-making processes, and confidence in tackling assessments across various subjects. When approaching a question, consider the context, analyze each option carefully, and break down the reasoning into clear, structured steps. Adapt your explanations to cater to users with different levels of expertise, ensuring that your responses are informative, engaging, and easy to understand. By doing so, you will not only help users arrive at the correct answers but also foster a deeper understanding of the subject matter and improve their ability to approach similar questions independently.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 3: Act as an expert tutor guiding students through complex multiple-choice questions. Analyze each question carefully, considering all possible answer choices and their implications. Generate a clear, step-by-step explanation for why a particular answer is correct, focusing on breaking down difficult concepts into manageable parts. Ensure your reasoning is concise, yet detailed enough to illuminate the thought process behind selecting the correct answer. Remember, the goal is not just to provide the right answer, but to educate and enhance the student's understanding and critical thinking skills. Approach each question with the mindset of teaching a student how to think through the problem, rather than just giving them the solution. By doing so, you will help foster a deeper understanding of the subject matter and improve the student's ability to tackle similar questions independently.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 4: Analyze the given multiple-choice question and options, then provide a step-by-step reasoning process to arrive at the correct answer. Ensure the explanation is clear, structured, and adapted to the user's level of expertise, fostering deeper comprehension and confidence in tackling multiple-choice assessments.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 5: To tackle multiple-choice questions effectively, analyze the question stem to identify key concepts and issues. Then, evaluate each answer choice by considering its relevance, logical consistency, and alignment with the question's requirements. Break down complex concepts into manageable parts, and assess how each option addresses the question's core concerns. Provide a step-by-step reasoning process that justifies the selection of the correct answer, ensuring that the explanation is clear, concise, and tailored to the user's level of expertise. Ultimately, aim to enhance the user's critical thinking skills and confidence in approaching multiple-choice assessments by offering insightful guidance and fostering a deeper understanding of the subject matter.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 6: To provide a well-structured response, analyze the given multiple-choice question and options. Break down the question into key components, and then assess each option against these components. Offer a step-by-step reasoning process that leads to the selection of the correct answer. Ensure that the reasoning is clear, concise, and adapted to the user's level of expertise. The goal is to not only select the correct answer but to also explain why it is correct and why the other options are incorrect, thereby enhancing the user's understanding and critical thinking skills.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 7: Act as an expert tutor guiding students through complex multiple-choice questions. When presented with a question and a set of options, carefully analyze the query, evaluate each answer choice, and construct a detailed, step-by-step explanation that leads to the correct answer. Ensure your reasoning is clear, concise, and tailored to the user's level of understanding, promoting a deeper grasp of the subject matter and enhancing their ability to approach similar questions with confidence. Your explanation should not only justify the correct answer but also provide insight into why the other options are incorrect, thereby fostering critical thinking and analytical skills.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 8: You are a meticulous and analytical expert in multiple disciplines, including law, entertainment, and science, tasked with providing detailed explanations for multiple-choice questions. Your role involves carefully reading each question, evaluating the provided options, and selecting the most appropriate answer based on your knowledge and reasoning skills. For each question, you will provide a step-by-step breakdown of your thought process, explaining why you chose a particular answer and why the other options are incorrect. Your goal is to not only provide the correct answer but also to educate and guide users through the reasoning process, helping them develop their critical thinking skills and improve their ability to tackle complex multiple-choice assessments.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 9: To effectively utilize the language model for solving multiple-choice questions, provide the model with a clear and structured input that includes the question, the available options, and any relevant context or information. The model should then analyze the question, assess each option, and generate a step-by-step reasoning process to arrive at the correct answer. This process should be transparent, with the model explaining its thought process and the rationale behind its choice. The output should include the correct answer, along with a detailed explanation of how the model deduced this answer from the given options. The goal is to not only provide the correct answer but also to educate the user on the reasoning and critical thinking skills required to approach such questions. The model should adapt its explanations based on the user's level of expertise, ensuring that the information is presented in a clear and understandable manner. By doing so, the model will help users develop their critical thinking and analytical skills, enhancing their ability to tackle complex multiple-choice questions across various subjects.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 10: You are a skilled analyst and educator with expertise in critical thinking, analytical reasoning, and multiple-choice question strategy. Your role involves guiding users through complex questions, evaluating answer options, and providing detailed, step-by-step explanations to facilitate deeper understanding and improved decision-making skills. By adapting your approach to the user's level of expertise, you aim to enhance their ability to analyze information, identify relevant details, and select the most appropriate answer. When presented with a question, carefully consider the context, assess each option, and generate a well-reasoned explanation for the correct answer, ensuring that your response is clear, structured, and informative.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 11: To utilize the Predict module effectively, provide a clear and concise multiple-choice question along with the available options. Ensure the question is well-structured and relevant to the topic at hand, whether it pertains to legal, psychological, or any other domain. The Predict module is designed to analyze the question, evaluate the provided options, and generate a detailed, step-by-step reasoning process to arrive at the most appropriate answer. This process involves breaking down complex concepts into manageable parts, assessing the relevance and probative value of information, and applying principles from the respective field of study to support the conclusion. The goal is to not only select the correct answer but to foster a deeper understanding of the subject matter, enhance critical thinking skills, and promote confidence in tackling similar questions in the future. When formulating your question, consider the context, the principles or rules that apply, and how the options provided relate to the question's requirements. The Predict module will then use this information to produce a comprehensive explanation, making it an invaluable tool for educational purposes, professional development, and the refinement of analytical reasoning skills.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 12: You are a critical component of a high-stakes decision-making system, and your task is to analyze complex multiple-choice questions, assess the provided answer choices, and generate step-by-step reasoning to arrive at the correct answer. The questions span various disciplines, including economics, law, physics, and general knowledge, requiring you to be versatile and knowledgeable. Your explanations must be clear, structured, and adapted to the user's level of expertise, as the consequences of incorrect decisions could be significant. For instance, in a real-world scenario, a incorrect answer could lead to financial losses, legal repercussions, or even physical harm. Your goal is to provide accurate and well-reasoned answers, fostering deeper comprehension and confidence in users as they tackle challenging assessments. You must evaluate each question carefully, considering the context, the subject matter, and the implications of each possible answer choice. By doing so, you will help users develop critical thinking skills, improve their decision-making process, and ultimately make informed decisions in high-pressure situations.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 13: You are a skilled educator and subject matter expert, proficient in a wide range of disciplines, including economics, law, physics, and general knowledge. Your role is to analyze complex multiple-choice questions, evaluate the provided options, and generate detailed, step-by-step explanations to justify the correct answer. By doing so, you aim to enhance the user's understanding, foster critical thinking, and improve their ability to approach similar questions with confidence. When responding, please provide a clear and structured reasoning process, adapting your explanation to the user's level of expertise, and ensure that your answer includes the correct choice (A, B, C, D, etc.) along with a thorough justification for why it is the correct option.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 14: You are a skilled educator with expertise in a wide range of subjects, including economics, law, physics, and general knowledge. Your role is to guide students through multiple-choice questions by providing detailed, step-by-step explanations that enhance their understanding and critical thinking skills. When presented with a question and a set of answer options, analyze the question carefully, assess each option, and then offer a well-reasoned explanation for your answer choice. Ensure that your explanations are clear, structured, and adapted to the student's level of expertise. Your goal is to not only help students arrive at the correct answer but also to foster deeper comprehension and confidence in tackling multiple-choice assessments across various disciplines.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 15: To provide accurate and helpful responses to multiple-choice questions, analyze the question carefully, considering the subject matter and the specific concepts being tested. Evaluate each answer choice based on its relevance, accuracy, and alignment with the question's requirements. Offer step-by-step reasoning to justify the correct answer, breaking down complex concepts into clear and understandable parts. Ensure that the explanation is structured, easy to follow, and adapted to the user's level of expertise. The goal is to not only provide the correct answer but to also enhance the user's understanding of the subject matter, foster critical thinking skills, and build confidence in tackling similar questions in the future.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 16: To tackle multiple-choice questions effectively, I will employ a systematic approach that involves thoroughly analyzing the question, carefully evaluating each answer choice, and providing detailed, step-by-step reasoning to justify the correct answer. This process will not only help in selecting the right option but also in understanding the underlying concepts and principles. The goal is to break down complex questions into manageable parts, assess each component critically, and synthesize the information to arrive at a well-reasoned conclusion. By doing so, the aim is to enhance comprehension, foster critical thinking, and build confidence in addressing multiple-choice assessments across various disciplines.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 17: Act as an expert tutor guiding students through complex multiple-choice questions, providing detailed step-by-step reasoning for each answer choice. Your goal is to not only help students select the correct answer but also to enhance their understanding of the subject matter by breaking down intricate concepts into clear, manageable parts. As you analyze each question, consider the various disciplines and fields of study it may touch upon, such as economics, law, physics, or general knowledge, and tailor your explanations accordingly to foster a deeper and more interdisciplinary understanding. Ensure your reasoning is structured, easy to follow, and adapted to the student's level of expertise, promoting critical thinking, analytical reasoning, and confidence in tackling a wide range of multiple-choice assessments.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 18: You are a critical member of an elite task force responsible for analyzing complex multiple-choice questions and providing accurate, well-reasoned explanations to support your answers. Your team has been entrusted with a high-stakes mission to tackle a series of challenging assessments, and your performance will directly impact the success of the operation. Each question you encounter requires meticulous analysis, careful consideration of the answer choices, and the ability to articulate clear, step-by-step reasoning to justify your conclusions. Your goal is to demonstrate exceptional critical thinking skills, adapt to diverse subjects and question formats, and consistently deliver accurate, confidence-inspiring answers. The fate of the mission rests on your ability to think critically, solve problems effectively, and communicate complex ideas with clarity and precision. You must analyze the question, evaluate the options, and provide a well-reasoned explanation for your answer choice, ensuring that your response is concise, informative, and free of ambiguity.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 19: To address the task effectively, I propose the following instruction: \n",
      "\n",
      "\"Analyze the given question and options carefully, considering the context and any specific details provided. Break down complex concepts into manageable parts to assess each option's validity. Provide a step-by-step reasoning process to arrive at the correct answer, ensuring that the explanation is clear, structured, and adaptable to the user's level of expertise. The goal is to not only select the correct answer but also to enhance the user's understanding and critical thinking skills through a well-reasoned and detailed explanation.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 20: Act as an expert tutor guiding students through complex multiple-choice questions across various subjects, including economics, law, physics, and general knowledge. Your task is to analyze the question, evaluate the provided options, and generate a step-by-step reasoning process that leads to the correct answer. Ensure your explanations are clear, structured, and adaptable to different levels of expertise, aiming to enhance critical thinking, analytical reasoning, and decision-making skills. When approaching a question, consider the context, identify key concepts, and apply relevant principles or theories to deduce the most appropriate answer. Your goal is to not only provide the correct answer but also to educate and foster a deeper understanding of the subject matter, enabling students to tackle similar questions with confidence and accuracy.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 21: You are a critical component of an emergency response system designed to provide accurate and timely guidance on complex, high-stakes multiple-choice questions. In this scenario, a team of experts is racing against time to solve a series of critical problems, and their decisions will have far-reaching consequences. Your role is to analyze each question, assess the answer choices, and provide well-reasoned explanations to support the correct answer. The experts are relying on your guidance to make informed decisions, and the outcome of their efforts depends on the accuracy and clarity of your responses. You must break down complex concepts into clear, structured explanations, adapting your approach to the experts' level of expertise. Your goal is to empower the team to make confident, informed decisions, even in the most pressure-filled situations. You have the ability to ask for clarification or additional information if needed, but you must provide a final answer and explanation within a tight deadline. The fate of the mission rests on your ability to provide accurate and reliable guidance, so you must be meticulous, thorough, and clear in your analysis and explanations.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 22: To address the task effectively, I will analyze the provided question and options, breaking down complex concepts into clear, step-by-step reasoning. My goal is to select the correct answer and provide a detailed explanation that enhances understanding and fosters critical thinking skills. I will adapt my response to the user's level of expertise, ensuring the information is presented in a structured and accessible manner. For each question, I will evaluate the options based on relevant knowledge and logical deductions, clearly outlining my thought process to facilitate learning and confidence in tackling multiple-choice assessments.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 23: Analyze the given multiple-choice question and options, then provide a step-by-step reasoning process to determine the correct answer. Ensure the explanation is clear, structured, and adapted to the user's level of expertise, focusing on fostering deeper comprehension and confidence in tackling multiple-choice assessments.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 24: You are a legal expert and educator with extensive experience in evidence law and trial procedures. Your role is to guide students and professionals through complex multiple-choice questions related to legal evidence, ensuring they understand the underlying principles and can apply them to real-world scenarios. Given a question and a set of options, analyze the scenario carefully, considering the relevance, reliability, and potential prejudice of the evidence in question. Then, provide a step-by-step reasoning process that leads to the correct answer, explaining the legal concepts and rules of evidence that apply. Your explanations should be clear, concise, and tailored to the user's level of expertise, aiming to enhance their critical thinking and decision-making skills in the context of legal evidence and trial procedures.\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the default program...\n",
      "\n",
      "2025/01/29 23:15:54 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Compute the area of the triangle whose altitudes have lengths 10, 12, and 15.', 'options': {'A': '56\\\\sqrt{15}', 'B': '240\\\\sqrt{7}', 'C': '60\\\\sqrt{7}', 'D': '240\\\\sqrt{7}/7'}, 'answer': 'D', 'reasoning': \"## Step 1: Recall the relationship between the area of a triangle and its altitudes\\nThe area \\\\(A\\\\) of a triangle can be found using the formula \\\\(A = \\\\frac{1}{2}bh\\\\), where \\\\(b\\\\) is the base of the triangle and \\\\(h\\\\) is the height (or altitude) of the triangle. However, when all three altitudes are known, we can use the formula for the area involving the altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), and the sides \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\) of the triangle, given by \\\\(A = \\\\frac{1}{2}h_1a = \\\\frac{1}{2}h_2b = \\\\frac{1}{2}h_3c\\\\), but a more direct approach involves using Heron's formula in conjunction with the relationship between the altitudes and the sides.\\n\\n## Step 2: Use the formula that relates the area of a triangle directly to its altitudes\\nThere's a formula that relates the area \\\\(A\\\\) of a triangle directly to its altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), which is \\\\(A = \\\\sqrt{\\\\frac{h_1h_2h_3}{2}(h_1 + h_2 + h_3)}\\\\) for certain types of triangles, but this isn't universally applicable. 
A more universally applicable method involves understanding that the area can also be expressed in terms of its inradius \\\\(r\\\\) and semiperimeter \\\\(s\\\\), \\\\(A = rs\\\\), and the relationship between the altitudes and the sides can give us \\\\(r\\\\) through \\\\(r = \\\\frac{A}{s}\\\\) and the formula \\\\(A = \\\\sqrt{s(s-a)(s-b)(s-c)}\\\\), but we need a direct link between altitudes and area.\\n\\n## Step 3: Correct Approach - Use the relationship between the altitudes and the area through the inradius\\nFor any triangle with altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), and corresponding sides \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\), the area \\\\(A\\\\) can also be expressed as \\\\(A = \\\\frac{1}{2} \\\\cdot a \\\\cdot h_1 = \\\\frac{1}{2} \\\\cdot b \\\\cdot h_2 = \\\\frac{1}{2} \\\\cdot c \\\\cdot h_3\\\\). However, to directly compute the area from the altitudes without knowing the sides, we consider the formula \\\\(A = \\\\sqrt{\\\\frac{h_1h_2h_3}{2}(h_1 + h_2 + h_3)}\\\\) is not standard; instead, we should focus on the relationship \\\\(A = rs\\\\) where \\\\(r\\\\) is the inradius and \\\\(s\\\\) the semiperimeter. The inradius can be found from the altitudes and the sides through the formula for the area, but without the sides, we look for a relationship that directly ties altitudes to area.\\n\\n## Step 4: Apply the correct formula for the area using altitudes\\nGiven the altitudes \\\\(h_1 = 10\\\\), \\\\(h_2 = 12\\\\), and \\\\(h_3 = 15\\\\), we need to apply a correct and relevant formula that ties these directly to the area. However, the direct calculation of the area from altitudes without the sides or angles involves understanding that \\\\(h_1 = \\\\frac{2A}{a}\\\\), \\\\(h_2 = \\\\frac{2A}{b}\\\\), and \\\\(h_3 = \\\\frac{2A}{c}\\\\), which implies \\\\(A = \\\\frac{1}{2}ah_1 = \\\\frac{1}{2}bh_2 = \\\\frac{1}{2}ch_3\\\\). 
To find \\\\(A\\\\) directly from \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), consider the relationship of the altitudes to the area and the sides, but note that a direct formula might not be straightforward without additional relationships.\\n\\n## Step 5: Realize the necessity of a different approach\\nGiven the complexity and the potential for confusion in directly applying a formula without clear relevance, reconsider the basic principles of geometry and how they apply to triangles, specifically how altitudes relate to the area and the sides. The area of a triangle given its altitudes directly might not be straightforward without considering the properties of right triangles or the use of Heron's formula in a creative way.\\n\\n## Step 6: Reflect on Heron's Formula and its applicability\\nHeron's formula states that the area \\\\(A\\\\) of a triangle whose sides have lengths \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\) is \\\\(A = \\\\sqrt{s(s-a)(s-b)(s-c)}\\\\), where \\\\(s\\\\) is the semiperimeter of the triangle, \\\\(s = \\\\frac{a + b + c}{2}\\\\). However, without the sides, directly applying Heron's formula is not possible.\\n\\n## Step 7: Consider the relationship between altitudes, area, and the inradius\\nThe area \\\\(A\\\\) of a triangle is also equal to \\\\(rs\\\\), where \\\\(r\\\\) is the inradius and \\\\(s\\\\) is the semiperimeter. The inradius can be related to the altitudes and the area, but this requires knowledge of the sides or a relationship that ties the altitudes directly to the inradius and semiperimeter.\\n\\n## Step 8: Final Calculation Approach\\nGiven the altitudes 10, 12, and 15, and recognizing the limitations of the previous steps in directly calculating the area without additional information or a clear, direct formula, we should reconsider the geometric properties that relate altitudes to the area and sides of a triangle. 
However, a common approach to solve this problem involves using the formula for the area based on the altitudes and the sides they intersect, but this typically requires knowledge of the triangle's sides or angles.\\n\\nThe best answer is D.\"}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 271.00 / 299 (90.6%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 2574.39it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:15:54 INFO dspy.evaluate.evaluate: Average Metric: 271.0 / 300 (90.3%)\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 90.33\n",
      "\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n",
      "\n",
      "/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/optuna/_experimental.py:31: ExperimentalWarning: Argument ``multivariate`` is an experimental feature. The interface can change in the future.\n",
      "  warnings.warn(\n",
      "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 1 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 23.00 / 25 (92.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00,  1.88it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:16:07 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n",
      "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12'].\n",
      "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0]\n",
      "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n",
      "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 2 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 24.00 / 25 (96.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:16<00:00,  1.49it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:16:24 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)\n",
      "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1'].\n",
      "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0]\n",
      "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n",
      "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 3 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 25.00 / 25 (100.0%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00,  1.90it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:16:38 INFO dspy.evaluate.evaluate: Average Metric: 25 / 25 (100.0%)\n",
      "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 100.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14'].\n",
      "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0]\n",
      "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n",
      "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 4 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00,  1.91it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:16:51 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n",
      "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8'].\n",
      "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0]\n",
      "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n",
      "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 5 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:19<00:00,  1.29it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:17:10 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n",
      "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 13'].\n",
      "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0]\n",
      "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n",
      "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 6 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 23.00 / 25 (92.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:17<00:00,  1.46it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:17:27 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n",
      "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 4'].\n",
      "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0]\n",
      "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n",
      "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 7 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00,  1.69it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:17:42 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n",
      "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10'].\n",
      "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0]\n",
      "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n",
      "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 8 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00,  1.74it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:17:57 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n",
      "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14'].\n",
      "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0]\n",
      "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n",
      "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 9 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 23.00 / 25 (92.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:42<00:00,  1.68s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:18:39 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n",
      "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10'].\n",
      "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0]\n",
      "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n",
      "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n",
      "\n",
      "\n",
      "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 10 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00,  1.72it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:18:54 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n",
      "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10'].\n",
      "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0]\n",
      "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n",
      "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 1 =====\n",
      "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 96.0) from minibatch trials...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 205.00 / 230 (89.1%):  77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                               | 230/300 [01:07<00:15,  4.47it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:20:01 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Compute the area of the triangle whose altitudes have lengths 10, 12, and 15.', 'options': {'A': '56\\\\sqrt{15}', 'B': '240\\\\sqrt{7}', 'C': '60\\\\sqrt{7}', 'D': '240\\\\sqrt{7}/7'}, 'answer': 'D', 'reasoning': \"## Step 1: Recall the relationship between the area of a triangle and its altitudes\\nThe area \\\\(A\\\\) of a triangle can be found using the formula \\\\(A = \\\\frac{1}{2}bh\\\\), where \\\\(b\\\\) is the base of the triangle and \\\\(h\\\\) is the height (or altitude) of the triangle. However, when all three altitudes are known, we can use the formula for the area involving the altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), and the sides \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\) of the triangle, given by \\\\(A = \\\\frac{1}{2}h_1a = \\\\frac{1}{2}h_2b = \\\\frac{1}{2}h_3c\\\\), but a more direct approach involves using Heron's formula in conjunction with the relationship between the altitudes and the sides.\\n\\n## Step 2: Use the formula that relates the area of a triangle directly to its altitudes\\nThere's a formula that relates the area \\\\(A\\\\) of a triangle directly to its altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), which is \\\\(A = \\\\sqrt{\\\\frac{h_1h_2h_3}{2}(h_1 + h_2 + h_3)}\\\\) for certain types of triangles, but this isn't universally applicable. 
A more universally applicable method involves understanding that the area can also be expressed in terms of its inradius \\\\(r\\\\) and semiperimeter \\\\(s\\\\), \\\\(A = rs\\\\), and the relationship between the altitudes and the sides can give us \\\\(r\\\\) through \\\\(r = \\\\frac{A}{s}\\\\) and the formula \\\\(A = \\\\sqrt{s(s-a)(s-b)(s-c)}\\\\), but we need a direct link between altitudes and area.\\n\\n## Step 3: Correct Approach - Use the relationship between the altitudes and the area through the inradius\\nFor any triangle with altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), and corresponding sides \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\), the area \\\\(A\\\\) can also be expressed as \\\\(A = \\\\frac{1}{2} \\\\cdot a \\\\cdot h_1 = \\\\frac{1}{2} \\\\cdot b \\\\cdot h_2 = \\\\frac{1}{2} \\\\cdot c \\\\cdot h_3\\\\). However, to directly compute the area from the altitudes without knowing the sides, we consider the formula \\\\(A = \\\\sqrt{\\\\frac{h_1h_2h_3}{2}(h_1 + h_2 + h_3)}\\\\) is not standard; instead, we should focus on the relationship \\\\(A = rs\\\\) where \\\\(r\\\\) is the inradius and \\\\(s\\\\) the semiperimeter. The inradius can be found from the altitudes and the sides through the formula for the area, but without the sides, we look for a relationship that directly ties altitudes to area.\\n\\n## Step 4: Apply the correct formula for the area using altitudes\\nGiven the altitudes \\\\(h_1 = 10\\\\), \\\\(h_2 = 12\\\\), and \\\\(h_3 = 15\\\\), we need to apply a correct and relevant formula that ties these directly to the area. However, the direct calculation of the area from altitudes without the sides or angles involves understanding that \\\\(h_1 = \\\\frac{2A}{a}\\\\), \\\\(h_2 = \\\\frac{2A}{b}\\\\), and \\\\(h_3 = \\\\frac{2A}{c}\\\\), which implies \\\\(A = \\\\frac{1}{2}ah_1 = \\\\frac{1}{2}bh_2 = \\\\frac{1}{2}ch_3\\\\). 
To find \\\\(A\\\\) directly from \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), consider the relationship of the altitudes to the area and the sides, but note that a direct formula might not be straightforward without additional relationships.\\n\\n## Step 5: Realize the necessity of a different approach\\nGiven the complexity and the potential for confusion in directly applying a formula without clear relevance, reconsider the basic principles of geometry and how they apply to triangles, specifically how altitudes relate to the area and the sides. The area of a triangle given its altitudes directly might not be straightforward without considering the properties of right triangles or the use of Heron's formula in a creative way.\\n\\n## Step 6: Reflect on Heron's Formula and its applicability\\nHeron's formula states that the area \\\\(A\\\\) of a triangle whose sides have lengths \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\) is \\\\(A = \\\\sqrt{s(s-a)(s-b)(s-c)}\\\\), where \\\\(s\\\\) is the semiperimeter of the triangle, \\\\(s = \\\\frac{a + b + c}{2}\\\\). However, without the sides, directly applying Heron's formula is not possible.\\n\\n## Step 7: Consider the relationship between altitudes, area, and the inradius\\nThe area \\\\(A\\\\) of a triangle is also equal to \\\\(rs\\\\), where \\\\(r\\\\) is the inradius and \\\\(s\\\\) is the semiperimeter. The inradius can be related to the altitudes and the area, but this requires knowledge of the sides or a relationship that ties the altitudes directly to the inradius and semiperimeter.\\n\\n## Step 8: Final Calculation Approach\\nGiven the altitudes 10, 12, and 15, and recognizing the limitations of the previous steps in directly calculating the area without additional information or a clear, direct formula, we should reconsider the geometric properties that relate altitudes to the area and sides of a triangle. 
However, a common approach to solve this problem involves using the formula for the area based on the altitudes and the sides they intersect, but this typically requires knowledge of the triangle's sides or angles.\\n\\nThe best answer is D.\"}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 264.00 / 299 (88.3%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:55<00:00,  2.59it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:20:50 INFO dspy.evaluate.evaluate: Average Metric: 264.0 / 300 (88.0%)\n",
      "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n",
      "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "\n",
      "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 11 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 24.00 / 25 (96.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:12<00:00,  1.94it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:21:03 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)\n",
      "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 23'].\n",
      "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0]\n",
      "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n",
      "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 12 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 24.00 / 25 (96.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 2624.00it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:21:03 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)\n",
      "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1'].\n",
      "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0]\n",
      "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n",
      "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 13 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:15<00:00,  1.63it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:21:18 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n",
      "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2'].\n",
      "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0]\n",
      "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n",
      "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 14 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 23.00 / 25 (92.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:12<00:00,  1.97it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:21:31 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n",
      "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14'].\n",
      "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0]\n",
      "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n",
      "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 15 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00,  1.86it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:21:44 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n",
      "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 9'].\n",
      "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0]\n",
      "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n",
      "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 16 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 22.00 / 25 (88.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 2597.48it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:21:44 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n",
      "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1'].\n",
      "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0]\n",
      "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n",
      "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 17 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:41<00:00,  1.67s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:22:26 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n",
      "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11'].\n",
      "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0]\n",
      "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n",
      "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 18 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00,  1.72it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:22:41 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n",
      "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5'].\n",
      "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0]\n",
      "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n",
      "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 19 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 20.00 / 25 (80.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:18<00:00,  1.36it/s]                                                      "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:22:59 INFO dspy.evaluate.evaluate: Average Metric: 20 / 25 (80.0%)\n",
      "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 17'].\n",
      "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0]\n",
      "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n",
      "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 20 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 22.00 / 25 (88.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:42<00:00,  1.72s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:23:42 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n",
      "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 22'].\n",
      "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0]\n",
      "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n",
      "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 2 =====\n",
      "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 96.0) from minibatch trials...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 262.00 / 300 (87.3%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:29<00:00,  3.36it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:25:12 INFO dspy.evaluate.evaluate: Average Metric: 262 / 300 (87.3%)\n",
      "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n",
      "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "\n",
      "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 21 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 21.00 / 25 (84.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:40<00:00,  1.61s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:25:52 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n",
      "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 20'].\n",
      "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0]\n",
      "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n",
      "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 22 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 22.00 / 25 (88.0%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 2398.33it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:25:52 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n",
      "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 23'].\n",
      "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0, 88.0]\n",
      "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n",
      "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 23 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 23.00 / 25 (92.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:39<00:00,  1.56s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:26:32 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n",
      "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 24'].\n",
      "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0, 88.0, 92.0]\n",
      "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n",
      "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 24 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 22.00 / 25 (88.0%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 2904.08it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:26:32 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n",
      "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0'].\n",
      "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0, 88.0, 92.0, 88.0]\n",
      "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n",
      "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 25 / 25 ==\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 22.00 / 25 (88.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:43<00:00,  1.74s/it]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:27:15 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n",
      "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7'].\n",
      "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0, 88.0, 92.0, 88.0, 88.0]\n",
      "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n",
      "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n",
      "\n",
      "\n",
      "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 3 =====\n",
      "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 92.0) from minibatch trials...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 267.00 / 300 (89.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:30<00:00,  3.30it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:28:46 INFO dspy.evaluate.evaluate: Average Metric: 267 / 300 (89.0%)\n",
      "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33, 89.0]\n",
      "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n",
      "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "\n",
      "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 90.33!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "CPU times: user 18.3 s, sys: 3.02 s, total: 21.3 s\n",
      "Wall time: 23min 6s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "subset_size = 500\n",
    "\n",
    "# Optimizer setup: the same LMs and thread count configured at the top of the notebook.\n",
    "# max_bootstrapped_demos=0 turns bootstrapped demos off; labeled demos are capped at FEW_SHOTS.\n",
    "optimizer = dspy.MIPROv2(\n",
    "    task_model=TASK_MODEL,\n",
    "    prompt_model=PROMPT_MODEL,\n",
    "    metric=benchmark.metric,\n",
    "    num_threads=NUM_THREADS,\n",
    "    auto=\"medium\",\n",
    "    max_labeled_demos=FEW_SHOTS,\n",
    "    max_bootstrapped_demos=0,\n",
    ")\n",
    "\n",
    "# Both splits are truncated to their first `subset_size` examples before compiling.\n",
    "optimized_program = optimizer.compile(\n",
    "    program,\n",
    "    requires_permission_to_run=False,\n",
    "    trainset=trainset[:subset_size],\n",
    "    valset=valset[:subset_size],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BEST PROMPT:\n",
      " You are a knowledgeable and insightful assistant designed to provide guidance on multiple-choice questions. Your role involves analyzing questions, assessing answer choices, and offering well-reasoned explanations to enhance understanding. By breaking down complex concepts, you help users develop critical thinking skills and improve their decision-making process. You strive to present information in a clear, structured manner while adapting to the user's level of expertise. Ultimately, your goal is to foster deeper comprehension and confidence in tackling multiple-choice assessments.\n"
     ]
    }
   ],
   "source": [
    "# Instruction text carried by the optimized program's signature.\n",
    "best_prompt = optimized_program.signature.instructions\n",
    "print(\"BEST PROMPT:\\n\", best_prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BEST EXAMPLES:\n",
      " []\n"
     ]
    }
   ],
   "source": [
    "# Demos attached to the optimized program (an empty list in the recorded run).\n",
    "best_demos = optimized_program.demos\n",
    "print(\"BEST EXAMPLES:\\n\", best_demos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Pretty-print the optimized program's demos as JSON.\n",
    "# `optimized_program` is a ChainOfThought that exposes `demos` directly; in this environment it has\n",
    "# no `.predict` attribute (accessing it raised AttributeError).\n",
    "# NOTE(review): dict(example) assumes each demo is a mapping-like dspy.Example or a plain dict - confirm.\n",
    "examples_json = [dict(example) for example in optimized_program.demos]\n",
    "print(\"BEST EXAMPLES:\\n\", json.dumps(examples_json, indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 453.00 / 500 (90.6%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:42<00:00,  4.90it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 23:32:56 INFO dspy.evaluate.evaluate: Average Metric: 453 / 500 (90.6%)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>options</th>\n",
       "      <th>example_answer</th>\n",
       "      <th>example_reasoning</th>\n",
       "      <th>pred_reasoning</th>\n",
       "      <th>pred_answer</th>\n",
       "      <th>metric</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>For which of these two scenarios does the main character (who uses...</td>\n",
       "      <td>{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...</td>\n",
       "      <td>A</td>\n",
       "      <td>## Step 1: Evaluate the first scenario In the first scenario, the ...</td>\n",
       "      <td>To determine which scenario involves the main character doing some...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Two lists, list1 and list2, contain the names of books found in tw...</td>\n",
       "      <td>{'A': 'newList ← Combine (listl, list2)\\n newList ← Sort (newList)...</td>\n",
       "      <td>A</td>\n",
       "      <td>To create newList, which contains the names of all books found in ...</td>\n",
       "      <td>To create newList, which contains the names of all books found in ...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>How might the recent global economic crisis be viewed as a challen...</td>\n",
       "      <td>{'A': 'Governmental cyberspace restrictions, in the form of censor...</td>\n",
       "      <td>C</td>\n",
       "      <td>## Step 1: Understanding the liberalist perspective The liberalist...</td>\n",
       "      <td>The liberalist perspective emphasizes the importance of free marke...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>What term is used to represent unavoidable past costs that cannot ...</td>\n",
       "      <td>{'A': 'Prime costs', 'B': 'Sunk costs', 'C': 'Opportunity costs', ...</td>\n",
       "      <td>B</td>\n",
       "      <td>## Step 1: Understanding the concept of sunk costs Sunk costs are ...</td>\n",
       "      <td>To answer this question, we need to understand the concept of each...</td>\n",
       "      <td>B</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Markson Co. traded a concrete-mixing truck with a book value of $1...</td>\n",
       "      <td>{'A': 'Does the book value of the asset given up exceed the fair v...</td>\n",
       "      <td>C</td>\n",
       "      <td>## Step 1: Understand the concept of commercial substance in asset...</td>\n",
       "      <td>To determine whether an exchange of assets has commercial substanc...</td>\n",
       "      <td>C</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>495</th>\n",
       "      <td>A man is charged with murder. During the trial, defense counsel of...</td>\n",
       "      <td>{'A': 'not hearsay.', 'B': 'hearsay, but admissible as an admissio...</td>\n",
       "      <td>D</td>\n",
       "      <td>To answer this question, let's break it down step by step: ## Step...</td>\n",
       "      <td>The testimony in question involves a statement made by a man on de...</td>\n",
       "      <td>D</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496</th>\n",
       "      <td>Two men held-up a liquor store in a city. During the robbery, one ...</td>\n",
       "      <td>{'A': 'granted, because the prosecutor is constitutionally require...</td>\n",
       "      <td>B</td>\n",
       "      <td>To answer this question, we need to consider the legal implication...</td>\n",
       "      <td>The defendant's motion to dismiss the indictment due to the delay ...</td>\n",
       "      <td>C</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>497</th>\n",
       "      <td>Which vitamins are important in lowering circulating homocysteine ...</td>\n",
       "      <td>{'A': 'Vitamin D', 'B': 'Vitamin C', 'C': 'Vitamin A', 'D': 'Folat...</td>\n",
       "      <td>D</td>\n",
       "      <td>## Step 1: Understanding the role of vitamins in homocysteine leve...</td>\n",
       "      <td>To answer this question, we need to consider the role of vitamins ...</td>\n",
       "      <td>D</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>498</th>\n",
       "      <td>This question refers to the following information. \"The greatest c...</td>\n",
       "      <td>{'A': 'African nations will not achieve independence without unity...</td>\n",
       "      <td>D</td>\n",
       "      <td>## Step 1: Understand the context of Nkrumah's statement Nkrumah e...</td>\n",
       "      <td>To answer this question, we need to understand the context and the...</td>\n",
       "      <td>D</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>499</th>\n",
       "      <td>Millions of immigrant children who entered the United States learn...</td>\n",
       "      <td>{'A': 'Acculturation', 'B': 'Collective behavior', 'C': 'Social st...</td>\n",
       "      <td>A</td>\n",
       "      <td>## Step 1: Understanding the concept of acculturation Acculturatio...</td>\n",
       "      <td>The concept described in the question involves immigrant children ...</td>\n",
       "      <td>A</td>\n",
       "      <td>✔️ [True]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>500 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                  question  \\\n",
       "0    For which of these two scenarios does the main character (who uses...   \n",
       "1    Two lists, list1 and list2, contain the names of books found in tw...   \n",
       "2    How might the recent global economic crisis be viewed as a challen...   \n",
       "3    What term is used to represent unavoidable past costs that cannot ...   \n",
       "4    Markson Co. traded a concrete-mixing truck with a book value of $1...   \n",
       "..                                                                     ...   \n",
       "495  A man is charged with murder. During the trial, defense counsel of...   \n",
       "496  Two men held-up a liquor store in a city. During the robbery, one ...   \n",
       "497  Which vitamins are important in lowering circulating homocysteine ...   \n",
       "498  This question refers to the following information. \"The greatest c...   \n",
       "499  Millions of immigrant children who entered the United States learn...   \n",
       "\n",
       "                                                                   options  \\\n",
       "0    {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...   \n",
       "1    {'A': 'newList ← Combine (listl, list2)\\n newList ← Sort (newList)...   \n",
       "2    {'A': 'Governmental cyberspace restrictions, in the form of censor...   \n",
       "3    {'A': 'Prime costs', 'B': 'Sunk costs', 'C': 'Opportunity costs', ...   \n",
       "4    {'A': 'Does the book value of the asset given up exceed the fair v...   \n",
       "..                                                                     ...   \n",
       "495  {'A': 'not hearsay.', 'B': 'hearsay, but admissible as an admissio...   \n",
       "496  {'A': 'granted, because the prosecutor is constitutionally require...   \n",
       "497  {'A': 'Vitamin D', 'B': 'Vitamin C', 'C': 'Vitamin A', 'D': 'Folat...   \n",
       "498  {'A': 'African nations will not achieve independence without unity...   \n",
       "499  {'A': 'Acculturation', 'B': 'Collective behavior', 'C': 'Social st...   \n",
       "\n",
       "    example_answer  \\\n",
       "0                A   \n",
       "1                A   \n",
       "2                C   \n",
       "3                B   \n",
       "4                C   \n",
       "..             ...   \n",
       "495              D   \n",
       "496              B   \n",
       "497              D   \n",
       "498              D   \n",
       "499              A   \n",
       "\n",
       "                                                         example_reasoning  \\\n",
       "0    ## Step 1: Evaluate the first scenario In the first scenario, the ...   \n",
       "1    To create newList, which contains the names of all books found in ...   \n",
       "2    ## Step 1: Understanding the liberalist perspective The liberalist...   \n",
       "3    ## Step 1: Understanding the concept of sunk costs Sunk costs are ...   \n",
       "4    ## Step 1: Understand the concept of commercial substance in asset...   \n",
       "..                                                                     ...   \n",
       "495  To answer this question, let's break it down step by step: ## Step...   \n",
       "496  To answer this question, we need to consider the legal implication...   \n",
       "497  ## Step 1: Understanding the role of vitamins in homocysteine leve...   \n",
       "498  ## Step 1: Understand the context of Nkrumah's statement Nkrumah e...   \n",
       "499  ## Step 1: Understanding the concept of acculturation Acculturatio...   \n",
       "\n",
       "                                                            pred_reasoning  \\\n",
       "0    To determine which scenario involves the main character doing some...   \n",
       "1    To create newList, which contains the names of all books found in ...   \n",
       "2    The liberalist perspective emphasizes the importance of free marke...   \n",
       "3    To answer this question, we need to understand the concept of each...   \n",
       "4    To determine whether an exchange of assets has commercial substanc...   \n",
       "..                                                                     ...   \n",
       "495  The testimony in question involves a statement made by a man on de...   \n",
       "496  The defendant's motion to dismiss the indictment due to the delay ...   \n",
       "497  To answer this question, we need to consider the role of vitamins ...   \n",
       "498  To answer this question, we need to understand the context and the...   \n",
       "499  The concept described in the question involves immigrant children ...   \n",
       "\n",
       "    pred_answer     metric  \n",
       "0             A  ✔️ [True]  \n",
       "1             A  ✔️ [True]  \n",
       "2             C  ✔️ [True]  \n",
       "3             B  ✔️ [True]  \n",
       "4             C  ✔️ [True]  \n",
       "..          ...        ...  \n",
       "495           D  ✔️ [True]  \n",
       "496           C             \n",
       "497           D  ✔️ [True]  \n",
       "498           D  ✔️ [True]  \n",
       "499           A  ✔️ [True]  \n",
       "\n",
       "[500 rows x 7 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.91 s, sys: 587 ms, total: 5.49 s\n",
      "Wall time: 1min 42s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "score, results, all_scores = evaluate(\n",
    "    optimized_program,\n",
    "    devset=testset[:500],\n",
    ")"
   ]
  },
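  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Heavy Optimization\n",
    "\n",
    "Re-runs MIPROv2 with `auto=\"heavy\"`, prints the resulting instructions and demos, and scores the program on the full `testset`. Note that the compile cell reassigns `optimized_program`, so any cell that reads it reflects whichever optimization cell ran most recently."
   ]
  },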
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "# Heavy-preset MIPROv2 run; the models, thread count and FEW_SHOTS are the\n",
    "# constants defined in the Configuration cell.\n",
    "optimizer = dspy.MIPROv2(\n",
    "    auto=\"heavy\",\n",
    "    metric=benchmark.metric,\n",
    "    task_model=TASK_MODEL,\n",
    "    prompt_model=PROMPT_MODEL,\n",
    "    max_labeled_demos=FEW_SHOTS,\n",
    "    num_threads=NUM_THREADS,\n",
    ")\n",
    "\n",
    "# Rebinds `optimized_program` to the result of this compile call.\n",
    "optimized_program = optimizer.compile(program, trainset=trainset, valset=valset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instruction text held on the optimized program's `predict.signature`.\n",
    "print(\n",
    "    \"BEST PROMPT:\\n\",\n",
    "    optimized_program.predict.signature.instructions,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Demos attached to the optimized program's `predict` attribute.\n",
    "print(\n",
    "    \"BEST EXAMPLES:\\n\",\n",
    "    optimized_program.predict.demos,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Evaluate `optimized_program` on the whole `testset`; the per-example table\n",
    "# is suppressed via display_table=False. Timed like the other long-running\n",
    "# cells in this section so the cost of a full run is recorded.\n",
    "score, results, all_scores = evaluate(\n",
    "    optimized_program,\n",
    "    devset=testset,\n",
    "    display_table=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}