{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Imports & Env Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%reload_ext autoreload\n",
    "%autoreload 2\n",
    "import json\n",
    "import os\n",
    "import sys\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "load_dotenv()  # kept ahead of the remaining imports, as in the original ordering\n",
    "from datasets import load_dataset\n",
    "\n",
    "import dspy\n",
    "sys.path.append(os.path.abspath('../'))\n",
    "from benchmarks import llama_mmlu_pro, leaderboard_mmlu_pro"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Thread count used by dspy.Evaluate here and by MIPROv2 in the optimization cells.\n",
    "NUM_THREADS = 48\n",
    "\n",
    "# Passed to MIPROv2 as max_labeled_demos.\n",
    "FEW_SHOTS = 5\n",
    "\n",
    "# Passed to dspy.Evaluate as max_errors.\n",
    "MAX_EVAL_ERRORS = 500\n",
    "\n",
    "# Connection settings shared by the task model and the prompt model.\n",
    "# See https://docs.litellm.ai/docs/providers/vllm for details\n",
    "MODEL_NAME = \"hosted_vllm/meta-llama/Llama-3.3-70B-Instruct\"\n",
    "API_BASE = \"http://localhost:8000/v1\"\n",
    "API_KEY = \"dummy\"\n",
    "\n",
    "# Initial system prompt for the baseline program (may be left blank).\n",
    "BASE_PROMPT = \"You are a helpful assistant designed to help with multiple choice question.\"\n",
    "\n",
    "# Two separate client objects pointing at the same endpoint.\n",
    "# Optional keyword arguments noted by the original author: api_version, seed, max_tokens, timeout.\n",
    "TASK_MODEL = dspy.LM(MODEL_NAME, api_base=API_BASE, api_key=API_KEY)\n",
    "PROMPT_MODEL = dspy.LM(MODEL_NAME, api_base=API_BASE, api_key=API_KEY)\n",
    "\n",
    "dspy.configure(lm=TASK_MODEL)\n",
    "\n",
    "# Benchmark module under test; leaderboard_mmlu_pro is the other imported option.\n",
    "benchmark = llama_mmlu_pro\n",
    "\n",
    "# Without chain of thought:\n",
    "# program = dspy.Predict(\n",
    "#     benchmark.signature(\"\")\n",
    "# )\n",
    "\n",
    "# With chain of thought:\n",
    "program = dspy.ChainOfThought(benchmark.signature(BASE_PROMPT))\n",
    "\n",
    "# devset is left empty here; the evaluate(...) calls below pass their own devset.\n",
    "evaluate = dspy.Evaluate(\n",
    "    devset=[],\n",
    "    max_errors=MAX_EVAL_ERRORS,\n",
    "    metric=benchmark.metric,\n",
    "    num_threads=NUM_THREADS,\n",
    "    display_progress=True,\n",
    "    display_table=True,\n",
    "    return_all_scores=True,\n",
    "    return_outputs=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000, 200, 10779)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Split the benchmark data, then show the size of each split.\n",
    "trainset, valset, testset = benchmark.datasets(train_size=0.1, validation_size=0.2)\n",
    "\n",
    "tuple(map(len, (trainset, valset, testset)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11979"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the published eval details and map every record of the \"latest\" split\n",
    "# through benchmark._task_doc_example.\n",
    "dataset = load_dataset(\n",
    "    \"meta-llama/Llama-3.3-70B-Instruct-evals\",\n",
    "    \"Llama-3.3-70B-Instruct-evals__mmlu_pro__details\",\n",
    ")\n",
    "full_dataset = [benchmark._task_doc_example(doc) for doc in dataset[\"latest\"]]\n",
    "\n",
    "len(full_dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using the prompt from Meta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: this rebinds `program`; once the cell has run, every later cell that uses\n",
    "# `program` sees this prompt instead of the one set in the configuration cell.\n",
    "meta_prompt = \"Given the following question and candidate answers, choose the best answer.\"\n",
    "program = dspy.ChainOfThought(benchmark.signature(meta_prompt))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# Evaluate the current `program` on the full dataset.\n",
    "print(\"Starting execution...\")\n",
    "score, results, all_scores = evaluate(program, devset=full_dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Baseline Benchmark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BASE PROMPT:\n",
      " You are a helpful assistant designed to help with multiple choice question.\n",
      "CPU times: user 274 μs, sys: 7 μs, total: 281 μs\n",
      "Wall time: 238 μs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Instructions carried by the program's signature.\n",
    "base_instructions = program.signature.instructions\n",
    "print(\"BASE PROMPT:\\n\", base_instructions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BEST EXAMPLES:\n",
      " []\n",
      "CPU times: user 272 μs, sys: 9 μs, total: 281 μs\n",
      "Wall time: 245 μs\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Demos currently attached to the program.\n",
    "current_demos = program.demos\n",
    "print(\"BEST EXAMPLES:\\n\", current_demos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# Baseline run: evaluate the current `program` on the full dataset.\n",
    "print(\"Starting execution...\")\n",
    "score, results, all_scores = evaluate(program, devset=full_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `results` is a list of (example, prediction, score) tuples (the cells below unpack it\n",
    "# that way), so it has no DataFrame-style .head(); preview it with a slice instead.\n",
    "results[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "# Write every entry of `results` as one CSV row.\n",
    "with open(\"my_results.csv\", mode=\"w\", newline=\"\", encoding=\"utf-8\") as csv_file:\n",
    "    csv.writer(csv_file).writerows(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# One row per result: its position, the stringified prediction and the metric score.\n",
    "result_rows = [\n",
    "    (position, str(prediction), metric_value)\n",
    "    for position, (_example, prediction, metric_value) in enumerate(results)\n",
    "]\n",
    "df = pd.DataFrame(result_rows, columns=['Example_Index', 'Prediction', 'Score'])\n",
    "print(\"\\nResults DataFrame:\")\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _shorten(text, limit=100):\n",
    "    \"\"\"Return `text` cut to `limit` characters plus '...' when it is longer than `limit`.\"\"\"\n",
    "    if len(text) > limit:\n",
    "        return text[:limit] + '...'\n",
    "    return text\n",
    "\n",
    "\n",
    "def _comparison_row(example, prediction, score):\n",
    "    \"\"\"Build one row of the predicted-vs-correct comparison table.\"\"\"\n",
    "    if hasattr(prediction, 'answer'):\n",
    "        predicted = prediction.answer\n",
    "    else:\n",
    "        predicted = str(prediction)\n",
    "    return {\n",
    "        'Question': _shorten(example.inputs()['question']),\n",
    "        'Predicted Answer': predicted,\n",
    "        'Correct Answer': example.answer,\n",
    "        'Is Correct': '✔️' if score else '❌',\n",
    "    }\n",
    "\n",
    "\n",
    "comparison_df = pd.DataFrame(\n",
    "    [_comparison_row(example, prediction, score) for example, prediction, score in results]\n",
    ")\n",
    "\n",
    "csv_filename = 'prediction_results.csv'\n",
    "comparison_df.to_csv(csv_filename, index=False, encoding='utf-8-sig')  # utf-8-sig to handle special characters\n",
    "print(f\"\\nResults saved to {csv_filename}\")\n",
    "\n",
    "pd.set_option('display.max_colwidth', None)\n",
    "print(\"\\nPredictions vs Actual Answers:\")\n",
    "print(comparison_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# Evaluate the current `program` on the test split; the result is not assigned.\n",
    "print(\"Starting execution...\")\n",
    "evaluate(program, devset=testset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optimize Subset + Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 14:31:07 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:\n",
      "num_trials: 7\n",
      "minibatch: False\n",
      "num_candidates: 5\n",
      "valset size: 20\n",
      "\n",
      "2025/01/29 14:31:07 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n",
      "2025/01/29 14:31:07 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.\n",
      "\n",
      "2025/01/29 14:31:07 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=5 sets of demonstrations...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapping set 1/5\n",
      "Bootstrapping set 2/5\n",
      "Bootstrapping set 3/5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 25%|███████████▌                                  | 5/20 [00:00<00:00, 19.77it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.\n",
      "Bootstrapping set 4/5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 25%|███████████▎                                 | 5/20 [00:00<00:00, 249.55it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.\n",
      "Bootstrapping set 5/5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 10%|████▌                                        | 2/20 [00:00<00:00, 255.07it/s]\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "Proposing instructions...\n",
      "\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n",
      "\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: 0: You are a helpful assistant designed to help with multiple choice question.\n",
      "\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: 1: To address the task effectively, I propose the following instruction: \"You are a critical thinking assistant tasked with solving a multiple-choice question. Given the question and a set of options, provide a step-by-step reasoning process that evaluates each option based on the information provided in the question, and then select the most appropriate answer. Ensure that your reasoning is transparent, logical, and based on the context of the question. Finally, clearly state the correct answer choice.\n",
      "\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: 2: To answer this multiple-choice question, carefully analyze the given options and the context of the question. Consider the potential consequences, mathematical relationships, or physical principles that apply. Provide step-by-step reasoning to determine the correct answer, ensuring that your thought process is logical and easy to follow. When solving mathematical equations, use mental math or algebraic manipulations as necessary. For questions involving physical concepts, apply relevant principles such as conservation laws or fundamental properties of systems. Always evaluate the options in light of the question's requirements, selecting the choice that best aligns with your reasoning.\n",
      "\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: 3: You are a critical thinking assistant, skilled in analyzing complex questions across various academic and professional fields. Your task is to carefully evaluate the given question, consider the provided options, and generate a step-by-step reasoning process to arrive at the most appropriate answer. Ensure your reasoning is clear, concise, and well-structured, allowing users to follow your thought process easily. Remember to address the question directly, evaluate each option based on its relevance and accuracy, and conclude with a well-supported answer.\n",
      "\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: 4: You are a critical care specialist in a high-stakes, time-sensitive environment, and it's crucial that you make accurate decisions quickly. You have a patient in front of you who is in need of immediate attention, and the only way to save their life is by answering a series of multiple-choice questions correctly. Each question will present a complex scenario, and you must use your expertise to choose the correct answer from the provided options. Your reasoning and decision-making skills will be put to the test as you work through each question. You must think step by step, analyzing the information given, and provide a clear explanation for your answer choice. The patient's life depends on your ability to answer these questions correctly, so it's essential that you remain focused and provide the most accurate response possible. Please answer the question to the best of your ability, providing a detailed reasoning process and selecting the correct answer from the options provided.\n",
      "\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the default program...\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
      "Average Metric: 15.00 / 20 (75.0%): 100%|████████| 20/20 [00:00<00:00, 404.41it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 14:31:08 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 75.0\n",
      "\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n",
      "\n",
      "/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/optuna/_experimental.py:31: ExperimentalWarning: Argument ``multivariate`` is an experimental feature. The interface can change in the future.\n",
      "  warnings.warn(\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 1 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 14.00 / 20 (70.0%): 100%|████████| 20/20 [00:00<00:00, 300.22it/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 14:31:08 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0]\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 13.00 / 20 (65.0%): 100%|████████| 20/20 [00:00<00:00, 148.40it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 14:31:08 INFO dspy.evaluate.evaluate: Average Metric: 13 / 20 (65.0%)\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 65.0]\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 16.00 / 20 (80.0%): 100%|████████| 20/20 [00:00<00:00, 357.67it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 14:31:08 INFO dspy.evaluate.evaluate: Average Metric: 16 / 20 (80.0%)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mBest full score so far!\u001b[0m Score: 80.0\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 65.0, 80.0]\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 13.00 / 20 (65.0%): 100%|███████| 20/20 [00:00<00:00, 3125.76it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 14:31:08 INFO dspy.evaluate.evaluate: Average Metric: 13 / 20 (65.0%)\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 65.0, 80.0, 65.0]\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/29 14:31:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 16.00 / 20 (80.0%): 100%|████████| 20/20 [00:00<00:00, 424.70it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 14:31:09 INFO dspy.evaluate.evaluate: Average Metric: 16 / 20 (80.0%)\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3'].\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 65.0, 80.0, 65.0, 80.0]\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Average Metric: 12.00 / 20 (60.0%): 100%|████████| 20/20 [00:00<00:00, 306.69it/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 14:31:09 INFO dspy.evaluate.evaluate: Average Metric: 12 / 20 (60.0%)\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.0 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1'].\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 65.0, 80.0, 65.0, 80.0, 60.0]\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 7 =====\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Average Metric: 14.00 / 20 (70.0%): 100%|████████| 20/20 [00:00<00:00, 388.98it/s]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 14:31:09 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 4'].\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 65.0, 80.0, 65.0, 80.0, 60.0, 70.0]\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n",
      "\n",
      "\n",
      "2025/01/29 14:31:09 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 80.0!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.43 s, sys: 328 ms, total: 1.76 s\n",
      "Wall time: 1.55 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# \"light\" MIPROv2 run on the first `subset_size` training and validation examples.\n",
    "subset_size = 20\n",
    "train_subset = trainset[:subset_size]\n",
    "val_subset = valset[:subset_size]\n",
    "\n",
    "optimizer = dspy.MIPROv2(\n",
    "    metric=benchmark.metric,\n",
    "    auto=\"light\",\n",
    "    num_threads=NUM_THREADS,\n",
    "    task_model=TASK_MODEL,\n",
    "    prompt_model=PROMPT_MODEL,\n",
    "    max_labeled_demos=FEW_SHOTS,\n",
    ")\n",
    "\n",
    "optimized_program = optimizer.compile(\n",
    "    program,\n",
    "    trainset=train_subset,\n",
    "    valset=val_subset,\n",
    "    requires_permission_to_run=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# The top-level `optimized_program.signature` printed the unoptimized base prompt even\n",
    "# though the optimizer log reported a different winning instruction, so read the\n",
    "# instructions from the program's predictors instead.\n",
    "for predictor_name, predictor in optimized_program.named_predictors():\n",
    "    # Assumption to confirm against the installed DSPy version: some releases keep the\n",
    "    # tuned signature on `extended_signature`; fall back to `signature` otherwise.\n",
    "    tuned_signature = getattr(predictor, \"extended_signature\", predictor.signature)\n",
    "    print(f\"BEST PROMPT ({predictor_name}):\\n\", tuned_signature.instructions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BEST EXAMPLES:\n",
      " [Example({'question': 'Explain what difficulties would arise if messenger RNA molecules were not destroyed after they had produced some polypeptide chains.', 'options': {'A': 'mRNA would replicate rapidly', 'B': 'The cell would use mRNA as a source of energy', 'C': 'The cell would lack proteins', 'D': 'Cell would enter a state of permanent division', 'E': 'mRNA would be transformed into DNA', 'F': 'Excess protein production, energy depletion, and potential harm to the cell', 'G': 'mRNA would exit the cell and infect neighboring cells', 'H': 'Proteins would be broken down into mRNA', 'I': 'mRNA would become part of the cell membrane', 'J': 'mRNA would bind to lipids and carbohydrates, disrupting cellular metabolism'}, 'answer': 'F'}) (input_keys={'options', 'question'}), Example({'question': 'Based on the characteristic population curves that result from plotting population growth of a species, the most effective means of controlling the mosquito population is to', 'options': {'A': 'opt for zero population control once the K value of the curve has been reached', 'B': 'maintain the population at the highest point of its logistic curve', 'C': 'reduce the carrying capacity cif the environment to lower the K value', 'D': 'decrease the mortality rate', 'E': 'increase the birth rate of the species', 'F': 'drastically reduce the population below the K value', 'G': 'maintain the population at a point corresponding to the midpoint of its logistic curve', 'H': 'increase the carrying capacity of the environment to raise the K value', 'I': 'opt for zero population control at the beginning of the logistic curve', 'J': None}, 'answer': 'C'}) (input_keys={'options', 'question'}), Example({'question': 'Solve the equation 1.2 = 0.4y using mental math.', 'options': {'A': '3', 'B': '−3', 'C': '0.8', 'D': '2', 'E': '0.3', 'F': '5', 'G': '−4', 'H': '4', 'I': '6', 'J': '1.6'}, 'answer': 'A'}) (input_keys={'options', 'question'}), Example({'question': 'assume you are Indonesian. 
In 2010, the rupiah exchange rate was around IDR15,000/USD, and the consumer price index in Indonesia and the United States was at 100. In 2019, the exchange rate changed to IDR14,000/USD. Simultaneously, Indonesia’s inflation rose 5% due to the consumer price index rising to 105. Meanwhile, the United States’ inflation rate rose 10% due to the consumer price index rising to 110. Whats the real exchange rate?', 'options': {'A': '14000.00', 'B': '15500.00', 'C': '15000.00', 'D': '16000.00', 'E': '13500.00', 'F': '14666.67', 'G': '13888.89', 'H': '14800.00', 'I': '15200.00', 'J': '13333.33'}, 'answer': 'F'}) (input_keys={'options', 'question'}), Example({'question': 'To move the economy closer to full employment the central bank decides that the federal funds rate must be increased. The appropriate open market operation is to ______ which ______ the money supply ______ aggregate demand and fights ______. OMO \\xa0\\xa0\\xa0 MONEY SUPPLY \\xa0\\xa0\\xa0 AD \\xa0\\xa0\\xa0 TO FIGHT', 'options': {'A': 'Buy bonds \\xa0\\xa0\\xa0 Decreases \\xa0\\xa0\\xa0 Decreases \\xa0\\xa0\\xa0 Unemployment', 'B': 'Sell bonds \\xa0\\xa0\\xa0 Increases \\xa0\\xa0\\xa0 Decreases \\xa0\\xa0\\xa0 Unemployment', 'C': 'Buy bonds \\xa0\\xa0\\xa0 Increases \\xa0\\xa0\\xa0 Decreases \\xa0\\xa0\\xa0 Unemployment', 'D': 'Sell bonds \\xa0\\xa0\\xa0 Increases \\xa0\\xa0\\xa0 Decreases \\xa0\\xa0\\xa0 Inflation', 'E': 'Buy bonds \\xa0\\xa0\\xa0 Increases \\xa0\\xa0\\xa0 Increases \\xa0\\xa0\\xa0 Unemployment', 'F': 'Sell bonds \\xa0\\xa0\\xa0 Decreases \\xa0\\xa0\\xa0 Decreases \\xa0\\xa0\\xa0 Unemployment', 'G': 'Buy bonds \\xa0\\xa0\\xa0 Increases \\xa0\\xa0\\xa0 Increases \\xa0\\xa0\\xa0 Inflation', 'H': 'Sell bonds \\xa0\\xa0\\xa0 Decreases \\xa0\\xa0\\xa0 Decreases \\xa0\\xa0\\xa0 Inflation', 'I': 'Buy bonds \\xa0\\xa0\\xa0 Decreases \\xa0\\xa0\\xa0 Increases \\xa0\\xa0\\xa0 Inflation', 'J': 'Sell bonds \\xa0\\xa0\\xa0 Increases \\xa0\\xa0\\xa0 Increases \\xa0\\xa0\\xa0 Inflation'}, 'answer': 
'H'}) (input_keys={'options', 'question'})]\n"
     ]
    }
   ],
   "source": [
    "# Demos attached to the optimized program.\n",
    "best_demos = optimized_program.demos\n",
    "print(\"BEST EXAMPLES:\\n\", best_demos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dspy.Example has no `to_dict()` (it raised AttributeError here); the dict export\n",
    "# method is spelled `toDict()`. `json` comes from the imports cell.\n",
    "examples_json = [example.toDict() for example in optimized_program.demos]\n",
    "print(\"BEST EXAMPLES:\\n\", json.dumps(examples_json, indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "# Evaluate the optimized program on the test split (overwrites score/results/all_scores).\n",
    "score, results, all_scores = evaluate(optimized_program, devset=testset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Medium Optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2025/01/29 14:31:42 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:\n",
      "num_trials: 25\n",
      "minibatch: True\n",
      "num_candidates: 19\n",
      "valset size: 200\n",
      "\n",
      "2025/01/29 14:31:42 INFO dspy.teleprompt.mipro_optimizer_v2: \n",
      "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n",
      "2025/01/29 14:31:42 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.\n",
      "\n",
      "2025/01/29 14:31:42 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=19 sets of demonstrations...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapping set 1/19\n",
      "Bootstrapping set 2/19\n",
      "Bootstrapping set 3/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|▍                                            | 5/500 [00:16<26:58,  3.27s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.\n",
      "Bootstrapping set 4/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|▍                                          | 5/500 [00:47<1:17:43,  9.42s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.\n",
      "Bootstrapping set 5/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▏                                            | 2/500 [00:10<44:44,  5.39s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
      "Bootstrapping set 6/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|▍                                            | 5/500 [00:25<42:01,  5.09s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.\n",
      "Bootstrapping set 7/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|▎                                          | 4/500 [00:40<1:24:42, 10.25s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n",
      "Bootstrapping set 8/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▏                                            | 2/500 [00:10<45:15,  5.45s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
      "Bootstrapping set 9/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|▌                                            | 6/500 [00:26<36:40,  4.45s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 3 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.\n",
      "Bootstrapping set 10/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|▎                                          | 4/500 [00:35<1:12:42,  8.80s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n",
      "Bootstrapping set 11/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|▎                                            | 3/500 [00:18<49:42,  6.00s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n",
      "Bootstrapping set 12/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|▏                                          | 2/500 [00:20<1:23:41, 10.08s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n",
      "Bootstrapping set 13/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|▍                                            | 5/500 [00:24<41:05,  4.98s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bootstrapped 2 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.\n",
      "Bootstrapping set 14/19\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|                                                     | 0/500 [00:03<?, ?it/s]\n",
      "\n",
      "KeyboardInterrupt\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "subset_size = 500\n",
    "optimizer = dspy.MIPROv2(\n",
    "    metric=benchmark.metric,\n",
    "    auto=\"medium\",\n",
    "    num_threads=NUM_THREADS,\n",
    "    task_model=TASK_MODEL,\n",
    "    prompt_model=PROMPT_MODEL,\n",
    "    max_labeled_demos=FEW_SHOTS,\n",
    ")\n",
    "\n",
    "optimized_program = optimizer.compile(\n",
    "    program,\n",
    "    trainset=trainset[:subset_size],\n",
    "    valset=valset[:subset_size],\n",
    "    requires_permission_to_run=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the instruction text stored on the optimized predictor's signature.\n",
    "print(\n",
    "    \"BEST PROMPT:\\n\",\n",
    "    optimized_program.predict.signature.instructions,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the demos through `.predict`, the same access path the neighbouring cells\n",
    "# use for the signature instructions and for the JSON dump of the demos.\n",
    "print(\"BEST EXAMPLES:\\n\", optimized_program.predict.demos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Serialize the optimized few-shot demos so they can be pretty-printed as JSON.\n",
    "# dspy.Example exposes `toDict()` (camelCase); calling `to_dict()` raises\n",
    "# AttributeError, as seen when the same snippet was run earlier in this notebook.\n",
    "examples_json = [example.toDict() for example in optimized_program.predict.demos]\n",
    "print(\"BEST EXAMPLES:\\n\", json.dumps(examples_json, indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "# Evaluate the medium-optimized program on `testset`; three values come back.\n",
    "score, results, all_scores = evaluate(optimized_program, devset=testset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Heavy Optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "# MIPROv2 run with the `heavy` auto setting over the full train/validation splits.\n",
    "optimizer = dspy.MIPROv2(\n",
    "    metric=benchmark.metric,\n",
    "    auto=\"heavy\",\n",
    "    num_threads=NUM_THREADS,\n",
    "    task_model=TASK_MODEL,\n",
    "    prompt_model=PROMPT_MODEL,\n",
    "    max_labeled_demos=FEW_SHOTS,\n",
    ")\n",
    "\n",
    "# Pass requires_permission_to_run=False, as the medium run does, so compile does\n",
    "# not stop for an interactive confirmation during Restart & Run All.\n",
    "optimized_program = optimizer.compile(\n",
    "    program,\n",
    "    trainset=trainset,\n",
    "    valset=valset,\n",
    "    requires_permission_to_run=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the instruction text stored on the heavy-optimized predictor's signature.\n",
    "print(\n",
    "    \"BEST PROMPT:\\n\",\n",
    "    optimized_program.predict.signature.instructions,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dump the demos attached to the optimized predictor.\n",
    "print(\n",
    "    \"BEST EXAMPLES:\\n\",\n",
    "    optimized_program.predict.demos,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate the heavy-optimized program on `testset`, passing display_table=False.\n",
    "score, results, all_scores = evaluate(optimized_program, devset=testset, display_table=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}