### Imports & Env Setup

In [1]:
%reload_ext autoreload
%autoreload 2
import sys
import os
from dotenv import load_dotenv
load_dotenv()

import dspy
sys.path.append(os.path.abspath('../'))
from benchmarks import llama_mmlu_pro, leaderboard_mmlu_pro

* 'fields' has been removed


### Configuration

In [2]:
NUM_THREADS = 16

FEW_SHOTS = 5

# See https://docs.litellm.ai/docs/providers/vllm for details
TASK_MODEL = dspy.LM(
 "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
 api_base = 'http://localhost:8000/v1' , # or api_base ?
 # api_version: Optional[str] = None,
 # api_key: Optional[str] = None,
 # seed: Optional[int] = None,
 # max_tokens: Optional[int] = None,
 # timeout: Optional[Union[float, int]] = None,
)
PROMPT_MODEL = dspy.LM(
 "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
 api_base = 'http://localhost:8000/v1', # or api_base ?
 # api_version: Optional[str] = None,
 # api_key: Optional[str] = None,
 # seed: Optional[int] = None,
 # max_tokens: Optional[int] = None,
 # timeout: Optional[Union[float, int]] = None,
)

dspy.configure(lm=TASK_MODEL)

# replace this with llama_mmlu_pro or whatever
benchmark = llama_mmlu_pro

# Without chain of thought:
# program = dspy.Predict(
# benchmark.signature("")
# )

# With chain of thought:
program = dspy.ChainOfThought(
 benchmark.signature("You are a helpful assistant designed to help with multiple choice question. Always return a JSON object with the following format:\n"
 "{\n"
 ' "reasoning": "Step-by-step reasoning here.",\n'
 ' "answer": "Final answer (A, B, C, etc.)"\n'
 "}\n"
 "Do NOT return plain text. Only return a valid JSON object with these keys.") # put your initial system prompt here, or leave blank
)

evaluate = dspy.Evaluate(
 devset=[],
 metric=benchmark.metric,
 num_threads=NUM_THREADS,
 display_progress=True,
 display_table=True,
 return_all_scores=True,
 return_outputs=True,
)

### Load dataset

In [3]:
trainset, valset, testset = benchmark.datasets(
 train_size=0.1,
 validation_size=0.2,
)

len(trainset), len(valset), len(testset)

(1197, 2156, 8626)

### Baseline Benchmark

In [4]:
%%time
print("BASE PROMPT:\n", program.predict.signature.instructions)

BASE PROMPT:
 Multiple choice question answering with reasoning.
CPU times: user 170 μs, sys: 21 μs, total: 191 μs
Wall time: 171 μs


In [None]:
%%time

print("Starting execution...")
eval_subset_size = len(testset)
evaluate(
 program,
 devset=testset[:eval_subset_size],
)

Starting execution...
Average Metric: 0.00 / 101 (0.0%): 1%|▉ | 101/8626 [00:57<1:31:30, 1.55it/s]

2025/01/22 00:51:43 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A compressible gas flows over a flat plate. Properties of the gasare closely similar to those of air. The flow is at a temperatureand pressure of 700°F and 30psia, respectively. The plate is 1 in. in length and is assumed to beperfectly insulated. If the gas is moving at a speed of 500 ft/sec, calculate the surface temperature of the plate. (Note: the speed is too great to neglect the effects of viscous dissipation.)', 'options': {'A': '700.5°F', 'B': '700°F', 'C': '780°F', 'D': '772°F', 'E': '735°F', 'F': '800°F', 'G': '750°F', 'H': '685°F', 'I': '716.25°F', 'J': '810°F'}, 'reasoning': '', 'answer': 'I'}) (input_keys={'question', 'options'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 0.00 / 116 (0.0%): 1%|█▏ | 116/8626 [01:08<1:13:52, 1.92it/s]

2025/01/22 00:51:53 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A mass $m$ moves in one dimension and is subject to a constant force $+F_0$ when $x<0$ and to a constant force $-F_0$ when $x>0$. Describe the motion by constructing a phase diagram. Calculate the period of the motion in terms of $m, F_0$, and the amplitude $A$ (disregard damping) .', 'options': {'A': '2 $\\sqrt{\\frac{m A}{F_0}}$', 'B': '6 $\\sqrt{\\frac{2 m A}{F_0}}$', 'C': '4 $\\sqrt{\\frac{m A}{F_0}}$', 'D': '2 $\\sqrt{\\frac{2 m A}{F_0}}$', 'E': '$\\pi \\sqrt{\\frac{2 m A}{F_0}}$', 'F': '$\\sqrt{\\frac{8 m A}{F_0}}$', 'G': '4 $\\sqrt{\\frac{m A}{2 F_0}}$', 'H': '$\\sqrt{\\frac{m A}{2 F_0}}$', 'I': ' 4 $\\sqrt{\\frac{2 m A}{F_0}}$', 'J': '$\\sqrt{\\frac{4 m A}{F_0}}$'}, 'reasoning': '', 'answer': 'D'}) (input_keys={'question', 'options'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 0.00 / 121 (0.0%): 1%|█▏ | 123/8626 [01:11<1:49:26, 1.29it/s]

2025/01/22 00:51:57 ERROR dspy.utils.parallelizer: Error processing item Example({'question': '(a) 10 annual mortgage payments of $1,000, (b) 12 monthly payments of $100 on his automobile, (c) a bill for $2,000 due in two years, (d) a bill for $1,000 due today. Using an annual interest rate of 12 percent (nominal rate on the automobile loan and effective rate on all other debts), determine the annual amount necessary to retire the entire debt in 15 years.', 'options': {'A': '$1,225.00', 'B': '$1,500.00', 'C': '$1,580.42', 'D': '$1,250.00', 'E': '$1,450.00', 'F': '$1,375.69', 'G': '$1,600.00', 'H': '$1,700.00', 'I': '$1,305.75', 'J': '$1,520.34'}, 'reasoning': '', 'answer': 'D'}) (input_keys={'question', 'options'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 0.00 / 143 (0.0%): 2%|█▍ | 146/8626 [01:24<1:19:06, 1.79it/s]



### Optimize Subset + Evaluation

In [6]:
%%time
subset_size = 20
optimizer = dspy.MIPROv2(
 metric=benchmark.metric,
 auto="light",
 num_threads=NUM_THREADS,
 task_model=TASK_MODEL,
 prompt_model=PROMPT_MODEL,
 max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
 program,
 trainset=trainset[:subset_size],
 valset=valset[:subset_size],
 requires_permission_to_run=False,
)

2025/01/21 15:46:14 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING LIGHT AUTO RUN SETTINGS:
num_trials: 7
minibatch: False
num_candidates: 5
valset size: 20

2025/01/21 15:46:14 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/01/21 15:46:14 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/01/21 15:46:14 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=5 sets of demonstrations...


Bootstrapping set 1/5
Bootstrapping set 2/5
Bootstrapping set 3/5


 25%|███████████████████████████████▎ | 5/20 [00:19<00:59, 3.98s/it]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 4/5


 20%|█████████████████████████ | 4/20 [00:34<02:17, 8.61s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 5/5


 30%|█████████████████████████████████████▌ | 6/20 [00:39<01:31, 6.52s/it]
2025/01/21 15:47:48 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/01/21 15:47:48 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 2 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.


2025/01/21 15:48:03 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing instructions...

2025/01/21 15:48:30 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/01/21 15:48:30 INFO dspy.teleprompt.mipro_optimizer_v2: 0: You are a helpful assistant designed to help with multiple choice question.

2025/01/21 15:48:30 INFO dspy.teleprompt.mipro_optimizer_v2: 1: You are a helpful assistant designed to help with multiple choice questions from various domains, including physics, mathematics, biology, economics, law, and social sciences. Given a question and a set of options, please provide a step-by-step reasoning process to arrive at the correct answer, and then select the correct answer from the options provided. Your response should include a detailed explanation of your thought process, ensuring that each step is clearly described and logically connected to the next, ultimately leading to the selection of the correct answer.

2025/01/21 15:48:30 INFO dsp

Average Metric: 15.00 / 20 (75.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:21<00:00, 1.09s/it]

2025/01/21 15:48:52 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)
2025/01/21 15:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 75.0

2025/01/21 15:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==
2025/01/21 15:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.

2025/01/21 15:48:52 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 1 / 7 =====



Average Metric: 14.00 / 20 (70.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:15<00:00, 1.30it/s]

2025/01/21 15:49:08 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)
2025/01/21 15:49:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1'].
2025/01/21 15:49:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0]
2025/01/21 15:49:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0


2025/01/21 15:49:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 7 =====



Average Metric: 15.00 / 20 (75.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:15<00:00, 1.31it/s]

2025/01/21 15:49:23 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)
2025/01/21 15:49:23 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].
2025/01/21 15:49:23 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 75.0]
2025/01/21 15:49:23 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0


2025/01/21 15:49:23 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 7 =====



Average Metric: 15.00 / 20 (75.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:13<00:00, 1.45it/s]

2025/01/21 15:49:37 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)
2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 1'].
2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 75.0, 75.0]
2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0


2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 7 =====



Average Metric: 15.00 / 20 (75.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 3087.23it/s]

2025/01/21 15:49:37 INFO dspy.evaluate.evaluate: Average Metric: 15 / 20 (75.0%)
2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 1'].
2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 75.0, 75.0, 75.0]
2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 75.0


2025/01/21 15:49:37 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 7 =====



Average Metric: 16.00 / 20 (80.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:15<00:00, 1.31it/s]

2025/01/21 15:49:53 INFO dspy.evaluate.evaluate: Average Metric: 16 / 20 (80.0%)
2025/01/21 15:49:53 INFO dspy.teleprompt.mipro_optimizer_v2: [92mBest full score so far![0m Score: 80.0
2025/01/21 15:49:53 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3'].
2025/01/21 15:49:53 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 75.0, 75.0, 75.0, 80.0]
2025/01/21 15:49:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0


2025/01/21 15:49:53 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 7 =====



Average Metric: 14.00 / 20 (70.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:15<00:00, 1.30it/s]

2025/01/21 15:50:08 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)
2025/01/21 15:50:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1'].
2025/01/21 15:50:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 75.0, 75.0, 75.0, 80.0, 70.0]
2025/01/21 15:50:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0


2025/01/21 15:50:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 7 =====



Average Metric: 14.00 / 20 (70.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:13<00:00, 1.45it/s]

2025/01/21 15:50:22 INFO dspy.evaluate.evaluate: Average Metric: 14 / 20 (70.0%)
2025/01/21 15:50:22 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 4'].
2025/01/21 15:50:22 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [75.0, 70.0, 75.0, 75.0, 75.0, 80.0, 70.0, 70.0]
2025/01/21 15:50:22 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 80.0


2025/01/21 15:50:22 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 80.0!





In [10]:
%%time
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

BEST PROMPT:
 You are a highly skilled expert witness in a high-stakes courtroom trial, and your task is to analyze complex multiple-choice questions and provide step-by-step reasoning to support your answer. The questions will cover a wide range of subjects, including physics, mathematics, biology, economics, law, and social sciences. Your goal is to think critically and arrive at the correct answer, while also providing a clear and coherent explanation of your thought process. The jury is counting on your expertise to make an informed decision, and the outcome of the trial hangs in the balance. Please respond with the correct answer and your reasoning.


In [11]:
%%time
eval_subset_size = len(testset)
score, results, all_scores = evaluate(
 optimized_program,
 devset=testset[:eval_subset_size],
)

Average Metric: 417.00 / 570 (73.2%): 7%|█████▏ | 574/8626 [6:54:48<96:58:50, 43.36s/it]
Average Metric: 218.00 / 286 (76.2%): 3%|██▊ | 286/8626 [02:32<59:39, 2.33it/s]

2025/01/21 23:37:28 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Consider a thoroughly mixed vessel where a salt is dis-solved inwater. The volume of the fresh water initially in the tank is 100lbm.The inlet conditions are,ṁ_w= 150 lb/hr. and ṁ_s= 30 lb/hr. The resulting solution leaves at a rate of 120 lb/hr. If the flow in and out remain constant, compute the outletconcentration after one hour.', 'options': {'A': '0.86', 'B': '0.76', 'C': '0.46', 'D': '0.16', 'E': '0.06', 'F': '0.26', 'G': '0.96', 'H': '0.56', 'I': '0.36', 'J': '0.66'}, 'answer': ''}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 452.00 / 604 (74.8%): 7%|█████▋ | 604/8626 [05:18<1:17:58, 1.71it/s]

2025/01/21 23:40:13 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Two identical conducting spheres, fixed in place, attract each other with an electrostatic force of $0.108 \\mathrm{~N}$ when their center-to-center separation is $50.0 \\mathrm{~cm}$. The spheres are then connected by a thin conducting wire. When the wire is removed, the spheres repel each other with an electrostatic force of $0.0360 \\mathrm{~N}$. Of the initial charges on the spheres, with a positive net charge, what was (a) the negative charge on one of them?', 'options': {'A': '$-2.00 \\mu \\mathrm{C}$', 'B': '$-2.50 \\mu \\mathrm{C}$', 'C': '$-0.50 \\mu \\mathrm{C}$', 'D': '$-1.75 \\mu \\mathrm{C}$', 'E': '$-0.75 \\mu \\mathrm{C}$', 'F': ' $-1.00 \\mu \\mathrm{C}$$ \\mu \\mathrm{C}$', 'G': '$-0.25 \\mu \\mathrm{C}$', 'H': '$-1.50 \\mu \\mathrm{C}$', 'I': '$-3.00 \\mu \\mathrm{C}$', 'J': '$-1.25 \\mu \\mathrm{C}$'}, 'answer': ''}) (input_keys={'options', 'question'}): Expected dict_keys(

Average Metric: 1075.00 / 1439 (74.7%): 17%|█████████████▎ | 1441/8626 [12:30<51:52, 2.31it/s]

2025/01/21 23:47:24 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Assume that all gases are perfect and that data refer to 298 K unless otherwise stated. Find an expression for the fugacity coefficient of a gas that obeys the equation of state $p V_{\\mathrm{m}}=R T\\left(1+B / V_{\\mathrm{m}}+C / V_{\\mathrm{m}}^2\\right)$. Use the resulting expression to estimate the fugacity of argon at 1.00 atm and $100 \\mathrm{~K}$ using $B=-21.13 \\mathrm{~cm}^3 \\mathrm{~mol}^{-1}$ and $C=1054 \\mathrm{~cm}^6 \\mathrm{~mol}^{-2}$.', 'options': {'A': '1.0236$\\text{atm}$', 'B': '1.0567 atm', 'C': '0.9852 atm', 'D': ' 0.9974$\\text{atm}$ ', 'E': '0.9321 atm', 'F': '1.0000 atm', 'G': '1.0150 atm', 'H': '0.9125$\\text{atm}$', 'I': '1.1024$\\text{atm}$', 'J': '0.9500 atm'}, 'answer': ''}) (input_keys={'options', 'question'}): 'list' object has no attribute 'items'. Set `provide_traceback=True` to see the stack trace.


Average Metric: 1319.00 / 1758 (75.0%): 20%|████████████████▎ | 1761/8626 [15:12<57:12, 2.00it/s]

2025/01/21 23:50:08 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'The ${ }^7 \\mathrm{Li}^1 \\mathrm{H}$ ground electronic state has $D_0=2.4287 \\mathrm{eV}, \\nu_e / c=1405.65 \\mathrm{~cm}^{-1}$, and $\\nu_e x_e / c=23.20 \\mathrm{~cm}^{-1}$, where $c$ is the speed of light. (These last two quantities are usually designated $\\omega_e$ and $\\omega_e x_e$ in the literature.) Calculate $D_e$ for ${ }^7 \\mathrm{Li}^1 \\mathrm{H}$.', 'options': {'A': '2.4000 eV', 'B': '2.6000 $\\mathrm{eV}$', 'C': '2.5500 eV', 'D': '2.3000 $\\mathrm{eV}$', 'E': '2.4287 $\\mathrm{eV}$', 'F': '2.3500 eV', 'G': ' 2.5151 $\\mathrm{eV}$', 'H': '2.4500 eV', 'I': '2.4850 eV', 'J': '2.5350 eV'}, 'answer': 'G'}) (input_keys={'options', 'question'}): 'list' object has no attribute 'items'. Set `provide_traceback=True` to see the stack trace.


Average Metric: 1353.00 / 1805 (75.0%): 21%|████████████████▊ | 1809/8626 [15:38<54:42, 2.08it/s]

ValueError: Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning'])

## Medium Optimization

In [None]:
%%time
subset_size = 500
optimizer = dspy.MIPROv2(
 metric=benchmark.metric,
 auto="medium",
 num_threads=NUM_THREADS,
 task_model=TASK_MODEL,
 prompt_model=PROMPT_MODEL,
 max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
 program,
 trainset=trainset[:subset_size],
 valset=valset[:subset_size],
 requires_permission_to_run=False,
)

In [None]:
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [None]:
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [None]:
print("BEST EXAMPLES:\n", optimized_program.predict.demos)

In [None]:
%%time
score, results, all_scores = evaluate(
 optimized_program,
 devset=testset[:subset_size],
 display_table=False,
)

In [None]:
%%time
eval_medium_subset_size = 300
score, results, all_scores = evaluate(
 optimized_program,
 devset=testset[:eval_medium_subset_size],
)

## Heavy Optimization

In [None]:
optimizer = dspy.MIPROv2(
 metric=benchmark.metric,
 auto="heavy",
 num_threads=NUM_THREADS,
 task_model=TASK_MODEL,
 prompt_model=PROMPT_MODEL,
 max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
 program,
 trainset=trainset,
 valset=valset,
)

In [None]:
print("BEST PROMPT:\n", optimized_program.predict.signature.instructions)

In [None]:
score, results, all_scores = evaluate(
 optimized_program,
 devset=testset,
 display_table=False,
)