### Imports & Env Setup

In [1]:
# Reload edited modules automatically before each cell executes, so changes to
# local modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
import sys
import os
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably kept ahead of `import dspy` so any keys are visible
# before the library is set up -- confirm the ordering is required.
load_dotenv()

import dspy
# Put the parent directory on the import path; presumably that is where the
# local `benchmarks` package lives -- confirm against the repo layout.
sys.path.append(os.path.abspath('../'))
from benchmarks import llama_mmlu_pro, leaderboard_mmlu_pro

* 'fields' has been removed


### Configuration

In [2]:
# Worker threads used by both the evaluator and the MIPROv2 optimizers.
NUM_THREADS = 16

# Upper bound on labeled demos handed to the optimizers (`max_labeled_demos`).
FEW_SHOTS = 5

# Connection settings for the locally hosted vLLM server.
# See https://docs.litellm.ai/docs/providers/vllm for details
MODEL_NAME = "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct"
API_BASE = "http://localhost:8000/v1"
# Read the key from the environment (populated by `load_dotenv()` in the first
# cell) instead of hard-coding it; falls back to the previous empty string.
API_KEY = os.environ.get("HOSTED_VLLM_API_KEY", "")

# Both models talk to the same endpoint, so the settings are defined once.
# Further optional settings the original left commented out:
# api_version, seed, max_tokens, timeout.
LM_KWARGS = dict(api_base=API_BASE, api_key=API_KEY)

# Separate LM instances for running the task and for proposing prompts.
TASK_MODEL = dspy.LM(MODEL_NAME, **LM_KWARGS)
PROMPT_MODEL = dspy.LM(MODEL_NAME, **LM_KWARGS)

# Make the task model the default LM for every dspy module in this notebook.
dspy.configure(lm=TASK_MODEL)

# Benchmark module under test; `leaderboard_mmlu_pro` (imported in the first
# cell) is the alternative to swap in here.
benchmark = llama_mmlu_pro

# Initial system prompt for the benchmark signature; use "" to leave it blank.
BASE_INSTRUCTIONS = "You are a helpful assistant designed to help with multiple choice question."
base_signature = benchmark.signature(BASE_INSTRUCTIONS)

# Without chain of thought:
# program = dspy.Predict(
#     benchmark.signature("")
# )

# With chain of thought:
program = dspy.ChainOfThought(base_signature)

# Shared evaluator for the whole notebook. The empty `devset` is only a
# placeholder: every call further down passes its own `devset=`. With both
# return flags enabled, later cells unpack the result as
# `score, results, all_scores`.
evaluate = dspy.Evaluate(
    metric=benchmark.metric,
    devset=[],
    num_threads=NUM_THREADS,
    # Reporting while the evaluation runs / after it finishes.
    display_progress=True,
    display_table=True,
    # Shape of the returned value.
    return_outputs=True,
    return_all_scores=True,
)

### Load dataset

In [3]:
# Ask the benchmark for its train / validation / test splits, using fractional
# sizes for the first two.
trainset, valset, testset = benchmark.datasets(train_size=0.1, validation_size=0.2)

# Display the size of each split as a tuple.
tuple(map(len, (trainset, valset, testset)))

(1197, 2156, 8626)

### Baseline Benchmark

In [4]:
# Show the instructions the un-optimized program starts from.
base_instructions = program.predict.signature.instructions
print("BASE PROMPT:\n", base_instructions)

BASE PROMPT:
 You are a helpful assistant designed to help with multiple choice question.


In [1]:
# `!export ...` runs in a throwaway subshell, so the variable never reaches the
# kernel process. Set it on the kernel's own environment instead (`os` is
# imported in the first cell).
os.environ["HOSTED_VLLM_API_KEY"] = ""

In [14]:
# NOTE(review): `os` is already imported in the first cell; the import is kept
# here so this cell still runs on its own.
import os

# Give the kernel process an (empty) API key for the hosted vLLM provider.
API_KEY_ENV_VAR = "HOSTED_VLLM_API_KEY"
os.environ[API_KEY_ENV_VAR] = ""

In [8]:
# Baseline: score the un-optimized program on the whole test split.
# The evaluator returns `(score, results, all_scores)`; keep them under
# baseline-specific names so later evaluation cells do not overwrite them and
# the cell does not echo every output.
# NOTE(review): the recorded run of this cell stopped with
# "AttributeError: 'list' object has no attribute 'items'"; the cause is not
# visible in this notebook -- investigate in the benchmark code.
eval_subset_size = len(testset)
baseline_score, baseline_results, baseline_all_scores = evaluate(
    program,
    devset=testset[:eval_subset_size],
)
baseline_score

2025/01/21 15:44:46 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'The spontaneous fission activity rate of U-238 is 6.7 fissions/kg s. A sample of shale contains 0.055% U-238 by weight. Calculate the number of spontaneous fissions in one day in a 106-kg pile of the shale by determining the number of fissions.', 'options': {'A': '400000000.0', 'B': '600000000.0', 'C': '50000000.0', 'D': '250000000.0', 'E': '100000000.0', 'F': '200000000.0', 'G': '700000000.0', 'H': '450000000.0', 'I': '150000000.0', 'J': '320000000.0'}, 'answer': 'J'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 638.00 / 848 (75.2%): 10%|████████ | 852/8626 [19:30<2:57:55, 1.37s/it]

2025/01/21 15:44:46 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Compute $\\dbinom{85}{82}$.', 'options': {'A': '102560', 'B': '252', 'C': '101170', 'D': '100890', 'E': '88440', 'F': '78960', 'G': '98770', 'H': '110870', 'I': '4680', 'J': '1254'}, 'answer': ''}) (input_keys={'options', 'question'}): 'list' object has no attribute 'items'. Set `provide_traceback=True` to see the stack trace.
2025/01/21 15:44:46 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A sleeve-coupling is used on a shaft 1(1/2) in. in diameter, delivering a torque of 500 lbs.-in. Calculate the diameter of the pin required to hold the coupling if the design stress for the pin material in shear is 15,000 psi.', 'options': {'A': '0.145 in.', 'B': '0.125 in.', 'C': '0.210 in.', 'D': '0.155 in.', 'E': '0.162 in.', 'F': '0.168 in.', 'G': '0.190 in.', 'H': '0.158 in.', 'I': '0.175 in.', 'J': '0.182 in.'}, 'answer': ''}) (input_keys={'options', 'question'}): Expect




2025/01/21 15:44:46 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'A 10 foot long chain is placed on a 4 foot high, frictionless table so that one end just reaches the floor. With what velocity will the other end slide off the table?', 'options': {'A': '13.6 ft. / sec.', 'B': '9.8 ft. / sec.', 'C': '12.8 ft. / sec.', 'D': '10.2 ft. / sec.', 'E': '6.7 ft. / sec.', 'F': '17.1 ft. / sec.', 'G': '18.3 ft. / sec.', 'H': '15.4 ft. / sec.', 'I': '20.4 ft. / sec.', 'J': '14.7 ft. / sec.'}, 'answer': 'B'}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.


Average Metric: 251.00 / 329 (76.3%): 4%|███ | 328/8626 [00:00<00:05, 1592.43it/s]

AttributeError: 'list' object has no attribute 'items'

Average Metric: 251.00 / 329 (76.3%): 4%|███ | 329/8626 [00:10<00:05, 1592.43it/s]

### Optimize Subset + Evaluation

In [None]:
# Light MIPROv2 run on a small slice of the train / validation splits.
subset_size = 20
light_trainset = trainset[:subset_size]
light_valset = valset[:subset_size]

optimizer = dspy.MIPROv2(
    auto="light",
    metric=benchmark.metric,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
    num_threads=NUM_THREADS,
)

# Compile without asking for permission to run.
optimized_program = optimizer.compile(
    program,
    trainset=light_trainset,
    valset=light_valset,
    requires_permission_to_run=False,
)

In [None]:
# Show the instructions carried by the current optimized program.
best_instructions = optimized_program.predict.signature.instructions
print("BEST PROMPT:\n", best_instructions)

In [None]:
# Score the current optimized program on the first 200 test examples.
eval_subset_size = 200
light_eval_set = testset[:eval_subset_size]
score, results, all_scores = evaluate(optimized_program, devset=light_eval_set)

### Medium Optimization

In [None]:
# Medium MIPROv2 run on a larger slice of the train / validation splits.
subset_size = 500
medium_trainset = trainset[:subset_size]
medium_valset = valset[:subset_size]

optimizer = dspy.MIPROv2(
    auto="medium",
    metric=benchmark.metric,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
    num_threads=NUM_THREADS,
)

# Compile without asking for permission to run.
optimized_program = optimizer.compile(
    program,
    trainset=medium_trainset,
    valset=medium_valset,
    requires_permission_to_run=False,
)

In [None]:
# Show the instructions carried by the current optimized program.
best_instructions = optimized_program.predict.signature.instructions
print("BEST PROMPT:\n", best_instructions)

In [None]:
# NOTE(review): this cell repeats the one directly before it; kept so the cell
# sequence is unchanged.
current_instructions = optimized_program.predict.signature.instructions
print("BEST PROMPT:\n", current_instructions)

In [None]:
# Show the demos attached to the current optimized program's predictor.
best_demos = optimized_program.predict.demos
print("BEST EXAMPLES:\n", best_demos)

In [None]:
# Score the current optimized program on the first `subset_size` test examples,
# without rendering the results table.
# NOTE(review): this reuses the optimization `subset_size` as the evaluation
# slice size -- confirm that is intended.
medium_eval_set = testset[:subset_size]
score, results, all_scores = evaluate(
    optimized_program, devset=medium_eval_set, display_table=False
)

In [None]:
# Score the current optimized program on the first 300 test examples.
eval_medium_subset_size = 300
medium_eval_slice = testset[:eval_medium_subset_size]
score, results, all_scores = evaluate(optimized_program, devset=medium_eval_slice)

### Heavy Optimization

In [None]:
# Heavy MIPROv2 run over the full train / validation splits.
optimizer = dspy.MIPROv2(
    metric=benchmark.metric,
    auto="heavy",
    num_threads=NUM_THREADS,
    task_model=TASK_MODEL,
    prompt_model=PROMPT_MODEL,
    max_labeled_demos=FEW_SHOTS,
)

optimized_program = optimizer.compile(
    program,
    trainset=trainset,
    valset=valset,
    # Match the light and medium runs: skip the interactive confirmation so the
    # cell does not block when the notebook is run top to bottom.
    requires_permission_to_run=False,
)

In [None]:
# Show the instructions carried by the current optimized program.
best_instructions = optimized_program.predict.signature.instructions
print("BEST PROMPT:\n", best_instructions)

In [None]:
# Score the current optimized program on the entire test split, without
# rendering the results table.
score, results, all_scores = evaluate(optimized_program, devset=testset, display_table=False)