{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Imports & Env Setup" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "import sys\n", "import os\n", "from dotenv import load_dotenv\n", "# Load .env first so its values are already in the environment when the packages below are imported.\n", "load_dotenv()\n", "from datasets import load_dataset\n", "\n", "\n", "import dspy\n", "# Assumes the local `benchmarks` package lives one directory above this notebook -- TODO confirm.\n", "sys.path.append(os.path.abspath('../'))\n", "from benchmarks import llama_mmlu_pro, leaderboard_mmlu_pro, llama_mmlu" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Configuration" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "NUM_THREADS = 48\n", "FEW_SHOTS = 5\n", "\n", "# The task model and the prompt model point at the same endpoint, so the\n", "# connection settings are defined once and reused for both.\n", "# See https://docs.litellm.ai/docs/providers/vllm for details\n", "MODEL_NAME = \"hosted_vllm/meta-llama/Llama-3.3-70B-Instruct\"\n", "LM_KWARGS = {\n", "    \"api_base\": \"http://localhost:8000/v1\",\n", "    \"api_key\": \"dummy\",  # placeholder value\n", "    # Optional extras can be added here: api_version, seed, max_tokens, timeout.\n", "}\n", "\n", "TASK_MODEL = dspy.LM(MODEL_NAME, **LM_KWARGS)\n", "PROMPT_MODEL = dspy.LM(MODEL_NAME, **LM_KWARGS)\n", "\n", "dspy.configure(lm=TASK_MODEL)\n", "\n", "# Benchmark under test; llama_mmlu_pro and leaderboard_mmlu_pro are imported as alternatives.\n", "benchmark = llama_mmlu\n", "\n", "# Initial system prompt; may be left blank.\n", "BASE_PROMPT = \"You are a helpful assistant.\"\n", "\n", "# Without chain of thought:\n", "# program = dspy.Predict(benchmark.signature(\"\"))\n", "\n", "# With chain of thought:\n", "program = dspy.ChainOfThought(benchmark.signature(BASE_PROMPT))\n", "\n", "evaluate = dspy.Evaluate(\n", "    devset=[],  # placeholder; the evaluation cells below pass their own devset\n", "    max_errors=500,\n", "    metric=benchmark.metric,\n", "    num_threads=NUM_THREADS,\n", "    display_progress=True,\n", "    display_table=True,\n", "    return_all_scores=True,\n", "    return_outputs=True,\n", "    provide_traceback=True,\n", ")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Load dataset" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1403, 1263, 11369)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainset, valset, testset = benchmark.datasets(\n", "    train_size=0.1,\n", "    validation_size=0.1,\n", ")\n", "\n", "len(trainset), len(valset), len(testset)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Baseline Benchmark" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BASE PROMPT:\n", " You are a helpful assistant.\n", "CPU times: user 270 μs, sys: 7 μs, total: 277 μs\n", "Wall time: 231 μs\n" ] } ], "source": [ "%%time\n", "print(\"BASE PROMPT:\\n\", program.signature.instructions)" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BEST EXAMPLES:\n", " []\n", "CPU times: user 107 μs, sys: 0 ns, total: 107 μs\n", "Wall time: 110 μs\n" ] } ], "source": [ "%%time\n", "print(\"BEST EXAMPLES:\\n\", program.demos)" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Starting execution...\n", "Average Metric: 35.00 / 40 (87.5%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:21<00:00, 1.84it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [
"2025/02/03 10:43:12 INFO dspy.evaluate.evaluate: Average Metric: 35 / 40 (87.5%)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questionoptionsexample_answerreasoningpred_answermetric
0The first hominids could be described as:{'A': 'quadrupedal knappers.', 'B': 'quadrupedal pongids.', 'C': '...CThe first hominids are known for their transition from a quadruped...C✔️ [True]
1Investment demand most likely increases when{'A': 'real GDP decreases.', 'B': 'the cost of acquiring and maint...CInvestment demand is influenced by several factors, including expe...C✔️ [True]
2Which of the following is a legitimate threat to societal security?{'A': 'Political policies designed to alter demographic characteri...BTo determine a legitimate threat to societal security, we must con...B✔️ [True]
3Long-run aggregate supply is most likely to increase as the result of{'A': 'an increase in the real interest rate', 'B': 'increased inv...BThe long-run aggregate supply (LRAS) curve represents the total am...B✔️ [True]
4As of 2013, share of people in the India who think political parti...{'A': '26%', 'B': '46%', 'C': '66%', 'D': '86%'}DTo answer this question, we need to consider the available data an...D✔️ [True]
5This question refers to the following information. \"I travelled th...{'A': 'The ability of commerce to foster cultural diffusion', 'B':...AThe journal entry by Ibn Battuta describes the city of 'Aden as a ...A✔️ [True]
6A woman is engaged in the retail sale of widgets throughout the Un...{'A': \"No, because the woman's order was too indefinite to constit...DThe woman's purchase order constituted an offer to the manufacture...D✔️ [True]
7For which of these two scenarios does the main character (who uses...{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...DIn Scenario 1, the main character kills spiders because their frie...D✔️ [True]
8A 67-year-old woman has had fatigue, dry skin, brittle hair, swell...{'A': 'Chronic lymphocytic thyroiditis (Hashimoto disease)', 'B': ...AThe patient's symptoms, such as fatigue, dry skin, brittle hair, s...A✔️ [True]
9Weber said that the 'spirit of capitalism' could be traced back to:{'A': 'the movement towards religious pluralism', 'B': 'inspiratio...DTo answer this question, we need to consider the ideas of Max Webe...D✔️ [True]
10This question refers to the following information. The history of ...{'A': 'The Oxford Movement', 'B': 'Materialism and economic determ...BThe Communist Manifesto, written by Karl Marx and Friedrich Engels...B✔️ [True]
11A nongovernmental, not-for-profit organization held the following ...{'A': '$12,700', 'B': '$13,000', 'C': '$13,800', 'D': '$14,900'}DTo determine the amount of stock investments that should be report...D✔️ [True]
12If you were hired by a large company to develop a new training pro...{'A': 'needs analysis.', 'B': 'job evaluation.', 'C': 'summative e...ATo develop an effective training program, it's crucial to understa...A✔️ [True]
13How were the first metals worked in South America?{'A': 'casting', 'B': 'hammering', 'C': 'smelting', 'D': 'all of t...DThe first metals worked in South America involved various techniqu...D✔️ [True]
14Under the Articles of Confederation, the national government had t...{'A': 'negotiate treaties', 'B': 'collect taxes', 'C': 'establish ...AThe Articles of Confederation, which served as the first constitut...A✔️ [True]
15A large company has offices in two locations, one in New Jersey an...{'A': '$22,500 ', 'B': '$23,700 ', 'C': '$25,500 ', 'D': '$27,300 '}DTo find the mean salary paid to the office assistants in the compa...D✔️ [True]
16Which character on the TV show 'Friends' is a chef?{'A': 'Joey', 'B': 'Monica', 'C': 'Ross', 'D': 'Rachel'}BTo answer this question, we need to consider the main characters o...B✔️ [True]
17Which of the following events INITIATES puberty?{'A': 'pituitary gland releases FSH', 'B': 'pituitary gland releas...CThe initiation of puberty is a complex process involving the hypot...C✔️ [True]
18Which of the boys on the TV show 'My Three Sons' is adopted?{'A': 'Mike', 'B': 'Ernie', 'C': 'Chip', 'D': 'Robbie'}BThe TV show 'My Three Sons' features a family with three boys. The...B✔️ [True]
19What is the minimum value of the expression x + 4z as a function d...{'A': '0', 'B': '-2', 'C': '-sqrt(34)', 'D': '-sqrt(35)'}CTo find the minimum value of the expression x + 4z subject to the ...C✔️ [True]
20What will happen to the equilibrium price and the equilibrium quan...{'A': 'The equilibrium price will rise and the equilibrium quantit...AWhen producers of good A expect the price to be higher in the near...C
21Construct a complete truth table for the following argument. Then,...{'A': 'Valid', 'B': 'Invalid. Counterexample when M and O are true...ATo determine the validity of the argument, we first need to constr...B
22This question refers to the following information. \"If any person ...{'A': 'rural and urban interests.', 'B': 'federal law and state la...BThe passage describes a Pennsylvania law from 1826 that criminaliz...B✔️ [True]
23Which of the following is not an element of the marketing mix?{'A': 'Promotion.', 'B': 'Product.', 'C': 'Target market.', 'D': '...CThe marketing mix, also known as the 4 Ps, consists of Product, Pr...C✔️ [True]
24Which of the following must be done when universal screening data ...{'A': 'Changes must be made in the delivery of the core program.',...AWhen universal screening data indicate that very few students are ...A✔️ [True]
25A large man with red hair robbed a liquor store. Thereafter, a def...{'A': 'admissible as a prior identification.', 'B': \"admissible, f...BThe corrections officer's testimony is being offered to prove that...B✔️ [True]
26Good X is exchanged in a competitive market. Which of the followin...{'A': 'If the demand curve is perfectly elastic, the price rises b...DWhen an excise tax is imposed on the production of a good in a com...D✔️ [True]
27Of the following compounds, which is LEAST likely to behave as a L...{'A': 'BeCl2', 'B': 'MgCl2', 'C': 'ZnCl2', 'D': 'SCl2'}DTo determine which of the given compounds is least likely to behav...B
28Mr. Cleary’s class and Ms. Ntuala’s class go to use the computer l...{'A': '2', 'B': '6', 'C': '10', 'D': '14'}CTo find the maximum number of students who can have a computer to ...{C}
29As of December 1, year 2, a company obtained a $1,000,000 line of ...{'A': 'Current liabilities of $1,000,000; long-term liabilities of...CTo determine the presentation of the company's debt in its classif...C✔️ [True]
30Use indirect truth tables to determine whether the following argum...{'A': 'Valid', 'B': 'Invalid. Counterexample when P, Q, R, and S a...ATo determine the validity of the argument using indirect truth tab...C
31Which expression represents the phrase below? 3 fewer than a numbe...{'A': '3-p', 'B': 'p+3', 'C': '3/p', 'D': 'p-3'}DTo represent the phrase \"3 fewer than a number, p\", we need to und...D✔️ [True]
32The influenza virus is mainly controlled in special \"risk\" sectors...{'A': 'Hygiene', 'B': 'Vaccination', 'C': 'Antiviral drugs', 'D': ...BThe influenza virus can be controlled through various methods, but...B✔️ [True]
33What size of cannula would you use in a patient who needed a rapid...{'A': '18 gauge.', 'B': '20 gauge.', 'C': '22 gauge.', 'D': '24 ga...ATo determine the correct size of cannula for a rapid blood transfu...A✔️ [True]
34Which one of the following is not a characteristic of a team?{'A': 'Minimal and formal knowledge sharing', 'B': 'Collective out...ATo determine which one of the following is not a characteristic of...A✔️ [True]
35When developing a plan of care relating to the management of a per...{'A': 'physical and pharmacological needs only.', 'B': 'physical a...CTo develop an effective plan of care for managing a person's pain,...C✔️ [True]
36Which of the following teenagers have the highest pregnancy rates?{'A': 'U.S.', 'B': 'Canadian', 'C': 'Swedish', 'D': 'French'}ATo answer this question, we need to consider the available data on...A✔️ [True]
37Based on the analysis of oxygen isotope ratios in the shells of ga...{'A': 'It became increasingly complex due to increased rainfall an...BThe analysis of oxygen isotope ratios in the shells of gastropods ...B✔️ [True]
38Which is a 'rock' under the UN Convention of the Law of the Sea (U...{'A': 'Rocks are the islands that are less than 10 square miles', ...CTo answer this question, we need to understand the definition of a...C✔️ [True]
39The Gravitron is a carnival ride that looks like a large cylinder....{'A': 'μv^2/(rg)', 'B': 'r^2v^2/(μg)', 'C': 'rg/(μv^2)', 'D': 'Non...ATo solve this problem, we need to consider the forces acting on a ...A✔️ [True]
\n", "
" ], "text/plain": [ " question \\\n", "0 The first hominids could be described as: \n", "1 Investment demand most likely increases when \n", "2 Which of the following is a legitimate threat to societal security? \n", "3 Long-run aggregate supply is most likely to increase as the result of \n", "4 As of 2013, share of people in the India who think political parti... \n", "5 This question refers to the following information. \"I travelled th... \n", "6 A woman is engaged in the retail sale of widgets throughout the Un... \n", "7 For which of these two scenarios does the main character (who uses... \n", "8 A 67-year-old woman has had fatigue, dry skin, brittle hair, swell... \n", "9 Weber said that the 'spirit of capitalism' could be traced back to: \n", "10 This question refers to the following information. The history of ... \n", "11 A nongovernmental, not-for-profit organization held the following ... \n", "12 If you were hired by a large company to develop a new training pro... \n", "13 How were the first metals worked in South America? \n", "14 Under the Articles of Confederation, the national government had t... \n", "15 A large company has offices in two locations, one in New Jersey an... \n", "16 Which character on the TV show 'Friends' is a chef? \n", "17 Which of the following events INITIATES puberty? \n", "18 Which of the boys on the TV show 'My Three Sons' is adopted? \n", "19 What is the minimum value of the expression x + 4z as a function d... \n", "20 What will happen to the equilibrium price and the equilibrium quan... \n", "21 Construct a complete truth table for the following argument. Then,... \n", "22 This question refers to the following information. \"If any person ... \n", "23 Which of the following is not an element of the marketing mix? \n", "24 Which of the following must be done when universal screening data ... \n", "25 A large man with red hair robbed a liquor store. Thereafter, a def... 
\n", "26 Good X is exchanged in a competitive market. Which of the followin... \n", "27 Of the following compounds, which is LEAST likely to behave as a L... \n", "28 Mr. Cleary’s class and Ms. Ntuala’s class go to use the computer l... \n", "29 As of December 1, year 2, a company obtained a $1,000,000 line of ... \n", "30 Use indirect truth tables to determine whether the following argum... \n", "31 Which expression represents the phrase below? 3 fewer than a numbe... \n", "32 The influenza virus is mainly controlled in special \"risk\" sectors... \n", "33 What size of cannula would you use in a patient who needed a rapid... \n", "34 Which one of the following is not a characteristic of a team? \n", "35 When developing a plan of care relating to the management of a per... \n", "36 Which of the following teenagers have the highest pregnancy rates? \n", "37 Based on the analysis of oxygen isotope ratios in the shells of ga... \n", "38 Which is a 'rock' under the UN Convention of the Law of the Sea (U... \n", "39 The Gravitron is a carnival ride that looks like a large cylinder.... \n", "\n", " options \\\n", "0 {'A': 'quadrupedal knappers.', 'B': 'quadrupedal pongids.', 'C': '... \n", "1 {'A': 'real GDP decreases.', 'B': 'the cost of acquiring and maint... \n", "2 {'A': 'Political policies designed to alter demographic characteri... \n", "3 {'A': 'an increase in the real interest rate', 'B': 'increased inv... \n", "4 {'A': '26%', 'B': '46%', 'C': '66%', 'D': '86%'} \n", "5 {'A': 'The ability of commerce to foster cultural diffusion', 'B':... \n", "6 {'A': \"No, because the woman's order was too indefinite to constit... \n", "7 {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr... \n", "8 {'A': 'Chronic lymphocytic thyroiditis (Hashimoto disease)', 'B': ... \n", "9 {'A': 'the movement towards religious pluralism', 'B': 'inspiratio... \n", "10 {'A': 'The Oxford Movement', 'B': 'Materialism and economic determ... 
\n", "11 {'A': '$12,700', 'B': '$13,000', 'C': '$13,800', 'D': '$14,900'} \n", "12 {'A': 'needs analysis.', 'B': 'job evaluation.', 'C': 'summative e... \n", "13 {'A': 'casting', 'B': 'hammering', 'C': 'smelting', 'D': 'all of t... \n", "14 {'A': 'negotiate treaties', 'B': 'collect taxes', 'C': 'establish ... \n", "15 {'A': '$22,500 ', 'B': '$23,700 ', 'C': '$25,500 ', 'D': '$27,300 '} \n", "16 {'A': 'Joey', 'B': 'Monica', 'C': 'Ross', 'D': 'Rachel'} \n", "17 {'A': 'pituitary gland releases FSH', 'B': 'pituitary gland releas... \n", "18 {'A': 'Mike', 'B': 'Ernie', 'C': 'Chip', 'D': 'Robbie'} \n", "19 {'A': '0', 'B': '-2', 'C': '-sqrt(34)', 'D': '-sqrt(35)'} \n", "20 {'A': 'The equilibrium price will rise and the equilibrium quantit... \n", "21 {'A': 'Valid', 'B': 'Invalid. Counterexample when M and O are true... \n", "22 {'A': 'rural and urban interests.', 'B': 'federal law and state la... \n", "23 {'A': 'Promotion.', 'B': 'Product.', 'C': 'Target market.', 'D': '... \n", "24 {'A': 'Changes must be made in the delivery of the core program.',... \n", "25 {'A': 'admissible as a prior identification.', 'B': \"admissible, f... \n", "26 {'A': 'If the demand curve is perfectly elastic, the price rises b... \n", "27 {'A': 'BeCl2', 'B': 'MgCl2', 'C': 'ZnCl2', 'D': 'SCl2'} \n", "28 {'A': '2', 'B': '6', 'C': '10', 'D': '14'} \n", "29 {'A': 'Current liabilities of $1,000,000; long-term liabilities of... \n", "30 {'A': 'Valid', 'B': 'Invalid. Counterexample when P, Q, R, and S a... \n", "31 {'A': '3-p', 'B': 'p+3', 'C': '3/p', 'D': 'p-3'} \n", "32 {'A': 'Hygiene', 'B': 'Vaccination', 'C': 'Antiviral drugs', 'D': ... \n", "33 {'A': '18 gauge.', 'B': '20 gauge.', 'C': '22 gauge.', 'D': '24 ga... \n", "34 {'A': 'Minimal and formal knowledge sharing', 'B': 'Collective out... \n", "35 {'A': 'physical and pharmacological needs only.', 'B': 'physical a... 
\n", "36 {'A': 'U.S.', 'B': 'Canadian', 'C': 'Swedish', 'D': 'French'} \n", "37 {'A': 'It became increasingly complex due to increased rainfall an... \n", "38 {'A': 'Rocks are the islands that are less than 10 square miles', ... \n", "39 {'A': 'μv^2/(rg)', 'B': 'r^2v^2/(μg)', 'C': 'rg/(μv^2)', 'D': 'Non... \n", "\n", " example_answer \\\n", "0 C \n", "1 C \n", "2 B \n", "3 B \n", "4 D \n", "5 A \n", "6 D \n", "7 D \n", "8 A \n", "9 D \n", "10 B \n", "11 D \n", "12 A \n", "13 D \n", "14 A \n", "15 D \n", "16 B \n", "17 C \n", "18 B \n", "19 C \n", "20 A \n", "21 A \n", "22 B \n", "23 C \n", "24 A \n", "25 B \n", "26 D \n", "27 D \n", "28 C \n", "29 C \n", "30 A \n", "31 D \n", "32 B \n", "33 A \n", "34 A \n", "35 C \n", "36 A \n", "37 B \n", "38 C \n", "39 A \n", "\n", " reasoning \\\n", "0 The first hominids are known for their transition from a quadruped... \n", "1 Investment demand is influenced by several factors, including expe... \n", "2 To determine a legitimate threat to societal security, we must con... \n", "3 The long-run aggregate supply (LRAS) curve represents the total am... \n", "4 To answer this question, we need to consider the available data an... \n", "5 The journal entry by Ibn Battuta describes the city of 'Aden as a ... \n", "6 The woman's purchase order constituted an offer to the manufacture... \n", "7 In Scenario 1, the main character kills spiders because their frie... \n", "8 The patient's symptoms, such as fatigue, dry skin, brittle hair, s... \n", "9 To answer this question, we need to consider the ideas of Max Webe... \n", "10 The Communist Manifesto, written by Karl Marx and Friedrich Engels... \n", "11 To determine the amount of stock investments that should be report... \n", "12 To develop an effective training program, it's crucial to understa... \n", "13 The first metals worked in South America involved various techniqu... \n", "14 The Articles of Confederation, which served as the first constitut... 
\n", "15 To find the mean salary paid to the office assistants in the compa... \n", "16 To answer this question, we need to consider the main characters o... \n", "17 The initiation of puberty is a complex process involving the hypot... \n", "18 The TV show 'My Three Sons' features a family with three boys. The... \n", "19 To find the minimum value of the expression x + 4z subject to the ... \n", "20 When producers of good A expect the price to be higher in the near... \n", "21 To determine the validity of the argument, we first need to constr... \n", "22 The passage describes a Pennsylvania law from 1826 that criminaliz... \n", "23 The marketing mix, also known as the 4 Ps, consists of Product, Pr... \n", "24 When universal screening data indicate that very few students are ... \n", "25 The corrections officer's testimony is being offered to prove that... \n", "26 When an excise tax is imposed on the production of a good in a com... \n", "27 To determine which of the given compounds is least likely to behav... \n", "28 To find the maximum number of students who can have a computer to ... \n", "29 To determine the presentation of the company's debt in its classif... \n", "30 To determine the validity of the argument using indirect truth tab... \n", "31 To represent the phrase \"3 fewer than a number, p\", we need to und... \n", "32 The influenza virus can be controlled through various methods, but... \n", "33 To determine the correct size of cannula for a rapid blood transfu... \n", "34 To determine which one of the following is not a characteristic of... \n", "35 To develop an effective plan of care for managing a person's pain,... \n", "36 To answer this question, we need to consider the available data on... \n", "37 The analysis of oxygen isotope ratios in the shells of gastropods ... \n", "38 To answer this question, we need to understand the definition of a... \n", "39 To solve this problem, we need to consider the forces acting on a ... 
\n", "\n", " pred_answer metric \n", "0 C ✔️ [True] \n", "1 C ✔️ [True] \n", "2 B ✔️ [True] \n", "3 B ✔️ [True] \n", "4 D ✔️ [True] \n", "5 A ✔️ [True] \n", "6 D ✔️ [True] \n", "7 D ✔️ [True] \n", "8 A ✔️ [True] \n", "9 D ✔️ [True] \n", "10 B ✔️ [True] \n", "11 D ✔️ [True] \n", "12 A ✔️ [True] \n", "13 D ✔️ [True] \n", "14 A ✔️ [True] \n", "15 D ✔️ [True] \n", "16 B ✔️ [True] \n", "17 C ✔️ [True] \n", "18 B ✔️ [True] \n", "19 C ✔️ [True] \n", "20 C \n", "21 B \n", "22 B ✔️ [True] \n", "23 C ✔️ [True] \n", "24 A ✔️ [True] \n", "25 B ✔️ [True] \n", "26 D ✔️ [True] \n", "27 B \n", "28 {C} \n", "29 C ✔️ [True] \n", "30 C \n", "31 D ✔️ [True] \n", "32 B ✔️ [True] \n", "33 A ✔️ [True] \n", "34 A ✔️ [True] \n", "35 C ✔️ [True] \n", "36 A ✔️ [True] \n", "37 B ✔️ [True] \n", "38 C ✔️ [True] \n", "39 A ✔️ [True] " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 842 ms, sys: 161 ms, total: 1 s\n", "Wall time: 22.1 s\n" ] } ], "source": [ "%%time\n", "\n", "# Size of the quick baseline slice; use len(testset) to cover the whole test split.\n", "EVAL_SUBSET_SIZE = 40\n", "\n", "print(\"Starting execution...\")\n", "score, results, all_scores = evaluate(\n", "    program,\n", "    devset=testset[:EVAL_SUBSET_SIZE],\n", ")" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import csv\n", "\n", "# Raw dump of the evaluation output. Each entry of `results` is an\n", "# (example, prediction, score) tuple; csv stringifies non-string fields with str().\n", "with open(\"my_results.csv\", mode=\"w\", newline=\"\", encoding=\"utf-8\") as csv_file:\n", "    csv.writer(csv_file).writerows(results)\n" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Results DataFrame:\n", " Example_Index \\\n", "0 0 \n", "1 1 \n", "2 2 \n", "3 3 \n", "4 4 \n", "5 5 \n", "6 6 \n", "7 7 \n", "8 8 \n", "9 9 \n", "10 10 \n", "11 11 \n", "12 12 \n", "13 13 \n", "14 14 \n", "15 15 \n", "16 16 \n", "17 17 \n", "18 18 \n", "19 19 \n", "20 20 \n", "21 21 \n", "22
22 \n", "23 23 \n", "24 24 \n", "25 25 \n", "26 26 \n", "27 27 \n", "28 28 \n", "29 29 \n", "30 30 \n", "31 31 \n", "32 32 \n", "33 33 \n", "34 34 \n", "35 35 \n", "36 36 \n", "37 37 \n", "38 38 \n", "39 39 \n", "40 40 \n", "41 41 \n", "42 42 \n", "43 43 \n", "44 44 \n", "45 45 \n", "46 46 \n", "47 47 \n", "48 48 \n", "49 49 \n", "50 50 \n", "51 51 \n", "52 52 \n", "53 53 \n", "54 54 \n", "55 55 \n", "56 56 \n", "57 57 \n", "58 58 \n", "\n", " Prediction \\\n", "0 Prediction(\\n reasoning=\"To solve this problem, we need to appl... \n", "1 Prediction(\\n reasoning='The capacitance of a parallel-plate ca... \n", "2 Prediction(\\n reasoning=\"To calculate the contribution to $U_{\\... \n", "3 Prediction(\\n reasoning='To translate the given statement \"Abdu... \n", "4 Prediction(\\n reasoning='The passage states that perestroika, o... \n", "5 Prediction(\\n reasoning='The refractory period is a phase that ... \n", "6 Prediction(\\n reasoning=\"To solve this problem, we need to cons... \n", "7 Prediction(\\n reasoning=\"To find the weight of one truck, we fi... \n", "8 Prediction(\\n reasoning='To estimate the magnitude of the elect... \n", "9 Prediction(\\n reasoning=\"The patient presents with nausea, vomi... \n", "10 Prediction(\\n reasoning=\"The builder properly rejected the ship... \n", "11 Prediction(\\n reasoning=\"The man conveyed a right-of-way easeme... \n", "12 Prediction(\\n reasoning='To answer this question, we need to co... \n", "13 Prediction(\\n reasoning='To answer this question, we need to un... \n", "14 Prediction(\\n reasoning='To determine the final angular momentu... \n", "15 Prediction(\\n reasoning='To calculate the molecular weight of t... \n", "16 Prediction(\\n reasoning=\"The appropriateness and usefulness of ... \n", "17 Prediction(\\n reasoning='To find the angular magnification of t... \n", "18 Prediction(\\n reasoning='Multiple-choice questions are often ea... 
\n", "19 Prediction(\\n reasoning='The question describes a treatment pro... \n", "20 Prediction(\\n reasoning='To solve this problem, we need to use ... \n", "21 Prediction(\\n reasoning=\"Since the DNA molecule is 20 percent a... \n", "22 Prediction(\\n reasoning='The process described has NADP+, ADP, ... \n", "23 Prediction(\\n reasoning='The question describes a scenario wher... \n", "24 Prediction(\\n reasoning='The lac operon is a genetic regulatory... \n", "25 Prediction(\\n reasoning='To answer this question, we need to id... \n", "26 Prediction(\\n reasoning='To find the total tax Mr. Howard pays,... \n", "27 Prediction(\\n reasoning='The amplitude of a damped harmonic osc... \n", "28 Prediction(\\n reasoning='The statement \"Men are better drivers ... \n", "29 Prediction(\\n reasoning='To find the desired level of money bal... \n", "30 Prediction(\\n reasoning='The clinical presentation described fo... \n", "31 Prediction(\\n reasoning=\"To solve this problem, we first need t... \n", "32 Prediction(\\n reasoning='To determine where Mr. Balfour should ... \n", "33 Prediction(\\n reasoning=\"To solve this problem, we need to unde... \n", "34 Prediction(\\n reasoning='The patient presents with symptoms of ... \n", "35 Prediction(\\n reasoning='The \"sex-change\" in bacteria is relate... \n", "36 Prediction(\\n reasoning='To answer this question, we need to co... \n", "37 Prediction(\\n reasoning=\"To determine if the burger joint owner... \n", "38 Prediction(\\n reasoning=\"To find the pH of a 0.1 M solution of ... \n", "39 Prediction(\\n reasoning='The question describes a scenario wher... \n", "40 Prediction(\\n reasoning='The common law principles regarding th... \n", "41 Prediction(\\n reasoning='To find the concentration of the sodiu... \n", "42 Prediction(\\n reasoning='LC oscillators, also known as tank cir... \n", "43 Prediction(\\n reasoning='To determine which of the given proces... 
\n", "44 Prediction(\\n reasoning=\"The marginal utility approach explains... \n", "45 Prediction(\\n reasoning=\"To solve this problem, we need to calc... \n", "46 Prediction(\\n reasoning=\"The correct answer is based on the pri... \n", "47 Prediction(\\n reasoning=\"To find the posterior probability \\\\(P... \n", "48 Prediction(\\n reasoning=\"To determine the day of the week on wh... \n", "49 Prediction(\\n reasoning=\"To solve this problem, we need to unde... \n", "50 Prediction(\\n reasoning='To calculate the total change in entro... \n", "51 Prediction(\\n reasoning='To determine when the tenant\\'s statut... \n", "52 Prediction(\\n reasoning=\"To solve this problem, we first need t... \n", "53 Prediction(\\n reasoning=\"To determine which type of organisatio... \n", "54 Prediction(\\n reasoning='To answer this question, we need to un... \n", "55 Prediction(\\n reasoning=\"To estimate a numerical measurement of... \n", "56 Prediction(\\n reasoning='Anscombe\\'s work focuses on the philos... \n", "57 Prediction(\\n reasoning=\"The practice of requiring students to ... \n", "58 Prediction(\\n reasoning=\"The production period in the history o... 
\n", "\n", " Score \n", "0 True \n", "1 True \n", "2 False \n", "3 True \n", "4 True \n", "5 True \n", "6 True \n", "7 True \n", "8 False \n", "9 True \n", "10 True \n", "11 True \n", "12 True \n", "13 True \n", "14 True \n", "15 False \n", "16 True \n", "17 True \n", "18 True \n", "19 True \n", "20 False \n", "21 False \n", "22 True \n", "23 True \n", "24 True \n", "25 True \n", "26 True \n", "27 False \n", "28 True \n", "29 True \n", "30 True \n", "31 False \n", "32 True \n", "33 False \n", "34 True \n", "35 True \n", "36 False \n", "37 True \n", "38 False \n", "39 True \n", "40 True \n", "41 True \n", "42 False \n", "43 True \n", "44 True \n", "45 True \n", "46 True \n", "47 True \n", "48 False \n", "49 True \n", "50 True \n", "51 True \n", "52 False \n", "53 True \n", "54 True \n", "55 True \n", "56 True \n", "57 False \n", "58 False \n" ] } ], "source": [
 "import pandas as pd\n",
 "\n",
 "# One row per evaluated example: its position in `results`, the stringified prediction, and its score.\n",
 "df = pd.DataFrame(\n",
 "    [(i, str(pred), score) for i, (_example, pred, score) in enumerate(results)],\n",
 "    columns=['Example_Index', 'Prediction', 'Score'],\n",
 ")\n",
 "print(\"\\nResults DataFrame:\")\n",
 "print(df)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "QUESTION_PREVIEW_CHARS = 100  # longer questions are cut to this many characters plus '...'\n",
 "\n",
 "\n",
 "def preview_text(text, limit=QUESTION_PREVIEW_CHARS):\n",
 "    \"\"\"Return `text` unchanged when it has at most `limit` characters, else its first `limit` characters plus '...'.\"\"\"\n",
 "    return text if len(text) <= limit else text[:limit] + '...'\n",
 "\n",
 "\n",
 "# Side-by-side view of each question, the predicted answer and the reference answer.\n",
 "# The question is looked up once per row instead of calling example.inputs() three times.\n",
 "comparison_df = pd.DataFrame([\n",
 "    {\n",
 "        'Question': preview_text(example.inputs()['question']),\n",
 "        'Predicted Answer': prediction.answer if hasattr(prediction, 'answer') else str(prediction),\n",
 "        'Correct Answer': example.answer,\n",
 "        'Is Correct': '✔️' if score else '❌',\n",
 "    }\n",
 "    for example, prediction, score in results\n",
 "])\n",
 "\n",
 "csv_filename = 'prediction_results.csv'\n",
 "# utf-8-sig prepends a BOM; kept so the special characters (e.g. the ✔️/❌ marks) survive in tools that need it.\n",
 "comparison_df.to_csv(csv_filename, index=False, encoding='utf-8-sig')\n",
 "print(f\"\\nResults saved to {csv_filename}\")\n",
 "\n",
 "print(\"\\nPredictions vs Actual Answers:\")\n",
 "# Lift the column-width limit only for this print, so later cells keep pandas' default display options.\n",
 "with pd.option_context('display.max_colwidth', None):\n",
 "    print(comparison_df)"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [
 "%%time\n",
 "\n",
 "print(\"Starting execution...\")\n",
 "evaluate(\n",
 "    program,\n",
 "    devset=testset,\n",
 ")"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Optimize Subset + Evaluation" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
 "import logging\n",
 "logging.getLogger('dspy').setLevel(logging.DEBUG)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "%%time\n",
 "subset_size = 200\n",
 "optimizer = dspy.MIPROv2(\n",
 "    metric=benchmark.metric,\n",
 "    auto=\"light\",\n",
 "    num_threads=NUM_THREADS,\n",
 "    task_model=TASK_MODEL,\n",
 "    prompt_model=PROMPT_MODEL,\n",
 "    max_bootstrapped_demos=FEW_SHOTS * 2,\n",
 "    max_labeled_demos=FEW_SHOTS,\n",
 ")\n",
 "\n",
 "optimized_program = optimizer.compile(\n",
 "    program,\n",
 "    trainset=trainset[:subset_size],\n",
 "    valset=valset[:subset_size],\n",
 "    requires_permission_to_run=False,\n",
 ")"
] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BEST PROMPT:\n", " You are a helpful assistant.\n", "CPU times: user 286 μs, sys: 24 μs, total: 310 μs\n", "Wall time: 265 μs\n" ] 
} ], "source": [
 "%%time\n",
 "print(\"BEST PROMPT:\\n\", optimized_program.signature.instructions)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Print the instructions of every predictor inside the optimized program.\n",
 "# named_predictors() is used instead of scanning optimized_program.__dict__: the raw scan also\n",
 "# matched the `signature` attribute itself, whose own `.signature` is a plain string, so the\n",
 "# `.instructions` lookup raised AttributeError ('str' object has no attribute 'instructions').\n",
 "for predictor_name, predictor in optimized_program.named_predictors():\n",
 "    print(f\"\\nModule {predictor_name} instructions:\")\n",
 "    print(predictor.signature.instructions)"
] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BEST EXAMPLES:\n", " []\n" ] } ], "source": [
 "print(\"BEST EXAMPLES:\\n\", optimized_program.demos)"
] }, { "cell_type": "code", "execution_count": 51, "metadata": { "scrolled": true }, "outputs": 
[ { "name": "stdout", "output_type": "stream", "text": [ "BEST EXAMPLES:\n", " [\n", " {\n", " \"question\": \"Dr. Ryan is a psychotherapist in a small town. She receives a phone call from a man she was sexually involved with several years ago. The relationship lasted only three months and ended amicably. The man is now married and is having problems unrelated to their past relationship. He has called to see if he can begin seeing Dr. Ryan in therapy. Dr. Ryan should:\",\n", " \"options\": {\n", " \"A\": \"see the man but disclose their past relationship to a supervisor or colleague for transparency.\",\n", " \"B\": \"deny the man's request without providing a reason.\",\n", " \"C\": \"see the man only if their relationship ended more than two years ago and she determines that it will not interfere with her objectivity.\",\n", " \"D\": \"see the man only if she feels she can maintain professional boundaries despite their past.\",\n", " \"E\": \"refer the man to a colleague.\",\n", " \"F\": \"continue therapy with the man but avoid discussing their past relationship.\",\n", " \"G\": \"see the man only if she determines that their past relationship will not interfere with her objectivity.\",\n", " \"H\": \"provide therapy for the man over the phone or through online sessions to maintain physical boundaries.\",\n", " \"I\": \"see the man only if his wife is aware of their past relationship and consents to the therapy.\",\n", " \"J\": \"see the man but discuss the potential for con\\ufb02icts with him before beginning treatment.\"\n", " },\n", " \"answer\": \"E\",\n", " \"reasoning\": \"The American Psychological Association's Ethics Code states that psychologists should refrain from entering into a therapeutic relationship with individuals with whom they have a personal relationship that could impair their professional judgment or create a risk of exploitation. In this case, Dr. 
Ryan's past sexual relationship with the man could potentially create a dual relationship, which may impair her objectivity and professional judgment. While the relationship ended amicably and was several years ago, it is still important for Dr. Ryan to consider whether her past involvement with the man could influence her ability to provide unbiased and effective therapy. The most appropriate course of action would be for Dr. Ryan to refer the man to a colleague, as this would avoid any potential conflict of interest or dual relationship. This approach prioritizes the man's need for therapy while also maintaining the ethical standards of the profession.\",\n", " \"input_keys\": null\n", " },\n", " {\n", " \"question\": \"A writer sent a four-page synopsis of an idea for a new television series to a Hollywood producer. He sent it in response to an ad for new ideas in an industry publication. He discussed it with the producer's assistant in a phone call, and mentioned his expectation of compensation. She said, \\\"Well, of course, we always pay for a writer's work.\\\" She said she would go over it with her boss to see if he liked it. Several months later, the writer saw a casting call for a new series. The plot and characters were nearly identical to those described in his synopsis. He sued the producer for breach of contract. The producer defended by arguing that there was no contract. 
What is the likely ruling of the court?\",\n", " \"options\": {\n", " \"A\": \"The court will rule that there can be no contract as the writer didn't explicitly state his expectation for compensation for the idea itself.\",\n", " \"B\": \"The court will rule that there was an implied in fact contract between the parties based on the conduct that they manifested.\",\n", " \"C\": \"The court will rule that there was no consideration specifically mentioned and under those circumstances, the law viewed the synopsis as a gift to the producer.\",\n", " \"D\": \"The court will rule that there is an express contract because there was a meeting of the mind and mutual assent to the basic terms.\",\n", " \"E\": \"The court will rule that there was a unilateral contract, with the producer obligated to pay only if he used the idea.\",\n", " \"F\": \"The court will rule that the writer's case is invalid as there was no written agreement.\",\n", " \"G\": \"The court will rule that the producer's assistant lacked the authority to form a contract, and thus, no contract exists.\",\n", " \"H\": \"There were too many terms left out for this to be a contract, and it was too indefinite for the court to imply what the terms might have been.\",\n", " \"I\": \"The court will rule that the phone call between the writer and the producer's assistant formed a verbal contract.\",\n", " \"J\": null\n", " },\n", " \"answer\": \"B\",\n", " \"reasoning\": \"\",\n", " \"input_keys\": null\n", " },\n", " {\n", " \"question\": \"A test charge q C, moving with a velocityv= (i_x +i_y) m/sec, experiences no force in a region of electric and magnetic fields. 
If the magnetic flux density B= (i_x - 2i_z)Wb/m^2, findE.\",\n", " \"options\": {\n", " \"A\": \"(i_z - 2i_y) volts/m\",\n", " \"B\": \"(-i_x + i_y) volts/m\",\n", " \"C\": \"(3i_x - i_y - 2i_z) volts/m\",\n", " \"D\": \"(2ix - 2i_y +i_z) volts/m\",\n", " \"E\": \"(i_x - 2i_z) volts/m\",\n", " \"F\": \"(2i_x - i_y) volts/m\",\n", " \"G\": \"(i_x +i_y) volts/m\",\n", " \"H\": \"(2ix + 2i_y -i_z) volts/m\",\n", " \"I\": \"(i_y + i_z) volts/m\",\n", " \"J\": \"(i_x + 2i_y + i_z) volts/m\"\n", " },\n", " \"answer\": \"D\",\n", " \"reasoning\": \"\",\n", " \"input_keys\": null\n", " }\n", "]\n" ] } ], "source": [
 "import json\n",
 "\n",
 "\n",
 "def example_to_dict(example):\n",
 "    \"\"\"Convert one demo of the optimized program into a JSON-serializable dict.\n",
 "\n",
 "    Returns a dict with the keys 'question', 'options', 'answer', 'reasoning' and 'input_keys'.\n",
 "    'reasoning' falls back to '' when the demo has no such field; 'input_keys' is a sorted\n",
 "    list of the demo's input field names, or None when none are recorded.\n",
 "    \"\"\"\n",
 "    # NOTE(review): the previous hasattr(example, 'input_keys') check never succeeded (the stored\n",
 "    # output shows null for every demo). dspy.Example appears to keep the keys set by\n",
 "    # with_inputs() on the private `_input_keys` attribute - confirm against the installed DSPy version.\n",
 "    input_keys = getattr(example, '_input_keys', None)\n",
 "    return {\n",
 "        'question': example.question,\n",
 "        'options': example.options,\n",
 "        'answer': example.answer,\n",
 "        'reasoning': getattr(example, 'reasoning', ''),\n",
 "        'input_keys': sorted(input_keys) if input_keys is not None else None,\n",
 "    }\n",
 "\n",
 "\n",
 "examples_json = [example_to_dict(example) for example in optimized_program.demos]\n",
 "print(\"BEST EXAMPLES:\\n\", json.dumps(examples_json, indent=2))"
] }, { "cell_type": "code", "execution_count": 16, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average Metric: 37.00 / 40 (92.5%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 2657.86it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/30 13:40:18 INFO dspy.evaluate.evaluate: Average Metric: 37 / 40 (92.5%)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questionoptionsexample_answerreasoningpred_answermetric
0For which of these two scenarios does the main character (who uses...{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...ATo determine which scenario involves the main character doing some...A✔️ [True]
1Which of the following is not a TV cartoon duo?{'A': 'Chip and Dale', 'B': 'Beavis and Butt-head', 'C': 'Simon an...CTo determine which of the following is not a TV cartoon duo, we ne...C✔️ [True]
2Market testing studies use __________ to carry out controlled expe...{'A': 'Test markets.', 'B': 'Data mining.', 'C': 'Analytical infor...ATo answer this question, we need to understand the concept of mark...A✔️ [True]
3Which of the following is the essential feature of Conduct Disorder{'A': 'Lack of appropriate guilt or remorse', 'B': 'Major rule or ...BTo answer this question, we need to consider the characteristics o...B✔️ [True]
4For which of these two scenarios does the main character (who uses...{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...CTo determine which scenario involves the main character doing some...C✔️ [True]
5An off-duty police officer was standing on a street corner waiting...{'A': 'assault with a deadly weapon.', 'B': 'involuntary manslaugh...BThe police officer's actions, although intended to frighten the ma...B✔️ [True]
6There are 2,000 insurance agents licensed under the general licens...{'A': 'constitutional, because a state license is a privilege and ...CThe key issue here is whether the state statute revoking the insur...C✔️ [True]
7Millions of immigrant children who entered the United States learn...{'A': 'Acculturation', 'B': 'Collective behavior', 'C': 'Social st...AThe concept described in the question involves immigrant children ...A✔️ [True]
8This question refers to the following information. \"When we were k...{'A': 'The labor union movement.', 'B': 'The civil rights movement...DThe Port Huron Statement, as excerpted, discusses themes of social...D✔️ [True]
9Who is the eighth-century CE female poet worshipped throughout ma...{'A': 'Andal', 'B': 'Devi', 'C': 'Ganga', 'D': 'Kali'}AThe question asks for an eighth-century CE female poet who is wors...A✔️ [True]
10Clifford and Lucia Pauling, in Senior View, told us that physical ...{'A': 'Are rapid and frightening', 'B': 'Can be offset by meditati...CTo answer this question, we need to consider the context of physic...C✔️ [True]
11Which statement best describes one of Dworkin's central arguments ...{'A': 'Morality plays no role in the concept of law.', 'B': 'Moral...DTo answer this question, we need to consider the central arguments...D✔️ [True]
12Light that is not transmitted by opaque materials is{'A': 'reflected or converted to internal energy in the material.'...AWhen light hits an opaque material, it does not pass through becau...A✔️ [True]
13Which of the following was not defined by Giddens (1998) as part o...{'A': 'the democratization of the family', 'B': 'putting an end to...BTo answer this question, we need to consider the key components of...B✔️ [True]
14The U.S. economy currently suffers a recessionary gap and a budget...{'A': 'Tax increase \\xa0\\xa0\\xa0 Demand rises \\xa0\\xa0\\xa0 Falling...CTo address a recessionary gap and a budget deficit through fiscal ...C✔️ [True]
15A company president is concerned about the low motivation and sati...{'A': 'ERG theory', 'B': 'expectancy theory', 'C': 'equity theory'...DThe scenario describes a situation where a company president imple...D✔️ [True]
16What characteristic is not representative of a type IIb muscle fib...{'A': 'Low oxidative capacity', 'B': 'High fatigue resistance', 'C...BTo answer this question, we need to understand the characteristics...B✔️ [True]
17The energy for all forms of muscle contraction is provided by:{'A': 'ATP.', 'B': 'ADP.', 'C': 'phosphocreatine.', 'D': 'oxidativ...ATo answer this question, we need to understand the role of differe...A✔️ [True]
18The main factor preventing subsistence economies from advancing ec...{'A': 'a currency.', 'B': 'a well-connected transportation infrast...DTo address this question, let's consider what subsistence economie...B
19The primary research method used by developmental psychologists is{'A': 'case study', 'B': 'cross-sectional research', 'C': 'natural...BDevelopmental psychologists often aim to understand how individual...B✔️ [True]
20Kevin wants shoes and grows turnips. Lisa wants turnips and makes ...{'A': 'Store of value', 'B': 'Unit of account', 'C': 'Medium of ex...CIn this scenario, Kevin, Lisa, and Bob have different needs and pr...C✔️ [True]
21The Federal Reserve implements an expansionary policy by doing whi...{'A': 'Buying Treasury bonds in the open market', 'B': 'Raising th...ATo answer this question, we need to understand the tools the Feder...A✔️ [True]
22Why do political scientists identify the presidential elections of...{'A': 'The issues at stake in those elections were more important ...BTo answer this question, we need to understand what is meant by \"c...B✔️ [True]
23An entity engaged an accountant to review its financial statements...{'A': 'Withdrawn from the engagement because the entity has not be...CWhen an accountant is engaged to review financial statements in ac...C✔️ [True]
24Which of these statements defines the Copenhagen School's view of ...{'A': 'Security is a socially constructed concept, relative to the...AThe Copenhagen School's view of security is rooted in the concept ...A✔️ [True]
25An object of mass m1 experiences a linear, elastic collision with ...{'A': 'The final speed of object 1', 'B': 'The initial speed of ob...DTo determine the mass of the second object in a linear, elastic co...D✔️ [True]
26This question refers to the following information. \"Thereupon it w...{'A': 'Portuguese explorers were convinced that the route to the N...DThe Treaty of Tordesillas, signed in 1494, was an agreement betwee...D✔️ [True]
27When both short-run aggregate supply and aggregate demand increase...{'A': 'The price level rises but real GDP falls.', 'B': 'Both the ...DWhen both short-run aggregate supply and aggregate demand increase...D✔️ [True]
28This question refers to the following information. \"The challenge ...{'A': 'the Civil Rights Act of 1964', 'B': 'Medicare and Medicaid'...DTo answer this question, we need to analyze the given text and opt...D✔️ [True]
29Ada Lovelace is credited with being the first person to have made ...{'A': 'a computer program', 'B': 'a souffle', 'C': 'a brassiere', ...ATo answer this question, we need to consider the historical contri...A✔️ [True]
30How are new polyomaviruses detailed{'A': 'Shot gun sequencing', 'B': 'Cultivation in human neural cel...CTo detail new polyomaviruses, we must consider the methods used fo...C✔️ [True]
31What is the square root of 81 squared?{'A': '9', 'B': '27', 'C': '81', 'D': '729'}CTo find the square root of 81 squared, we first calculate 81 squar...C✔️ [True]
32For which of these two scenarios does the main character (who uses...{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...CTo determine which scenario involves the main character doing some...C✔️ [True]
33This question refers to the following information. \"The conscience...{'A': 'Passage of the Pure Food and Water Act', 'B': 'Creation of ...DThe Progressive Party Platform of 1912 emphasizes the importance o...D✔️ [True]
34A 50-year-old nurse who had been fired from his job at ahospital t...{'A': 'No, because sanctions are not proper against the attorney o...BThe court is likely to grant the hospital's motion for sanctions b...C
35According to John Money, __________ refers to a person's private i...{'A': 'Gender identity', 'B': 'Sexual nomenclature', 'C': 'Gender ...ATo answer this question, we need to understand the concept of gend...A✔️ [True]
36Rounded to the nearest 10,000, the population of Louisiana was 4,5...{'A': '4,500,321', 'B': '4,524,491', 'C': '4,533,372', 'D': '4,535...CTo find the actual population of Louisiana in 2010, we need to con...C✔️ [True]
37Compared with the sound you hear from the siren of a stationary fi...{'A': 'speed', 'B': 'frequency', 'C': 'wavelength', 'D': 'All of t...BThe question is about the Doppler effect, which describes how the ...B✔️ [True]
38______ regulation goes further than the ______ that CSOs have with...{'A': 'Legal, Interactions, Mechanics', 'B': 'Civil, Relations, Ou...BTo solve this, let's break down the components of the sentence and...C
39Aristotle saw his theory of the virtues as:{'A': 'final and complete.', 'B': 'open-ended and revisable.', 'C'...BAristotle's theory of the virtues, as outlined in his Nicomachean ...B✔️ [True]
\n", "
" ], "text/plain": [ " question \\\n", "0 For which of these two scenarios does the main character (who uses... \n", "1 Which of the following is not a TV cartoon duo? \n", "2 Market testing studies use __________ to carry out controlled expe... \n", "3 Which of the following is the essential feature of Conduct Disorder \n", "4 For which of these two scenarios does the main character (who uses... \n", "5 An off-duty police officer was standing on a street corner waiting... \n", "6 There are 2,000 insurance agents licensed under the general licens... \n", "7 Millions of immigrant children who entered the United States learn... \n", "8 This question refers to the following information. \"When we were k... \n", "9 Who is the eighth-century CE female poet worshipped throughout ma... \n", "10 Clifford and Lucia Pauling, in Senior View, told us that physical ... \n", "11 Which statement best describes one of Dworkin's central arguments ... \n", "12 Light that is not transmitted by opaque materials is \n", "13 Which of the following was not defined by Giddens (1998) as part o... \n", "14 The U.S. economy currently suffers a recessionary gap and a budget... \n", "15 A company president is concerned about the low motivation and sati... \n", "16 What characteristic is not representative of a type IIb muscle fib... \n", "17 The energy for all forms of muscle contraction is provided by: \n", "18 The main factor preventing subsistence economies from advancing ec... \n", "19 The primary research method used by developmental psychologists is \n", "20 Kevin wants shoes and grows turnips. Lisa wants turnips and makes ... \n", "21 The Federal Reserve implements an expansionary policy by doing whi... \n", "22 Why do political scientists identify the presidential elections of... \n", "23 An entity engaged an accountant to review its financial statements... \n", "24 Which of these statements defines the Copenhagen School's view of ... 
\n", "25 An object of mass m1 experiences a linear, elastic collision with ... \n", "26 This question refers to the following information. \"Thereupon it w... \n", "27 When both short-run aggregate supply and aggregate demand increase... \n", "28 This question refers to the following information. \"The challenge ... \n", "29 Ada Lovelace is credited with being the first person to have made ... \n", "30 How are new polyomaviruses detailed \n", "31 What is the square root of 81 squared? \n", "32 For which of these two scenarios does the main character (who uses... \n", "33 This question refers to the following information. \"The conscience... \n", "34 A 50-year-old nurse who had been fired from his job at ahospital t... \n", "35 According to John Money, __________ refers to a person's private i... \n", "36 Rounded to the nearest 10,000, the population of Louisiana was 4,5... \n", "37 Compared with the sound you hear from the siren of a stationary fi... \n", "38 ______ regulation goes further than the ______ that CSOs have with... \n", "39 Aristotle saw his theory of the virtues as: \n", "\n", " options \\\n", "0 {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr... \n", "1 {'A': 'Chip and Dale', 'B': 'Beavis and Butt-head', 'C': 'Simon an... \n", "2 {'A': 'Test markets.', 'B': 'Data mining.', 'C': 'Analytical infor... \n", "3 {'A': 'Lack of appropriate guilt or remorse', 'B': 'Major rule or ... \n", "4 {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr... \n", "5 {'A': 'assault with a deadly weapon.', 'B': 'involuntary manslaugh... \n", "6 {'A': 'constitutional, because a state license is a privilege and ... \n", "7 {'A': 'Acculturation', 'B': 'Collective behavior', 'C': 'Social st... \n", "8 {'A': 'The labor union movement.', 'B': 'The civil rights movement... \n", "9 {'A': 'Andal', 'B': 'Devi', 'C': 'Ganga', 'D': 'Kali'} \n", "10 {'A': 'Are rapid and frightening', 'B': 'Can be offset by meditati... 
\n", "11 {'A': 'Morality plays no role in the concept of law.', 'B': 'Moral... \n", "12 {'A': 'reflected or converted to internal energy in the material.'... \n", "13 {'A': 'the democratization of the family', 'B': 'putting an end to... \n", "14 {'A': 'Tax increase \\xa0\\xa0\\xa0 Demand rises \\xa0\\xa0\\xa0 Falling... \n", "15 {'A': 'ERG theory', 'B': 'expectancy theory', 'C': 'equity theory'... \n", "16 {'A': 'Low oxidative capacity', 'B': 'High fatigue resistance', 'C... \n", "17 {'A': 'ATP.', 'B': 'ADP.', 'C': 'phosphocreatine.', 'D': 'oxidativ... \n", "18 {'A': 'a currency.', 'B': 'a well-connected transportation infrast... \n", "19 {'A': 'case study', 'B': 'cross-sectional research', 'C': 'natural... \n", "20 {'A': 'Store of value', 'B': 'Unit of account', 'C': 'Medium of ex... \n", "21 {'A': 'Buying Treasury bonds in the open market', 'B': 'Raising th... \n", "22 {'A': 'The issues at stake in those elections were more important ... \n", "23 {'A': 'Withdrawn from the engagement because the entity has not be... \n", "24 {'A': 'Security is a socially constructed concept, relative to the... \n", "25 {'A': 'The final speed of object 1', 'B': 'The initial speed of ob... \n", "26 {'A': 'Portuguese explorers were convinced that the route to the N... \n", "27 {'A': 'The price level rises but real GDP falls.', 'B': 'Both the ... \n", "28 {'A': 'the Civil Rights Act of 1964', 'B': 'Medicare and Medicaid'... \n", "29 {'A': 'a computer program', 'B': 'a souffle', 'C': 'a brassiere', ... \n", "30 {'A': 'Shot gun sequencing', 'B': 'Cultivation in human neural cel... \n", "31 {'A': '9', 'B': '27', 'C': '81', 'D': '729'} \n", "32 {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr... \n", "33 {'A': 'Passage of the Pure Food and Water Act', 'B': 'Creation of ... \n", "34 {'A': 'No, because sanctions are not proper against the attorney o... \n", "35 {'A': 'Gender identity', 'B': 'Sexual nomenclature', 'C': 'Gender ... 
\n", "36 {'A': '4,500,321', 'B': '4,524,491', 'C': '4,533,372', 'D': '4,535... \n", "37 {'A': 'speed', 'B': 'frequency', 'C': 'wavelength', 'D': 'All of t... \n", "38 {'A': 'Legal, Interactions, Mechanics', 'B': 'Civil, Relations, Ou... \n", "39 {'A': 'final and complete.', 'B': 'open-ended and revisable.', 'C'... \n", "\n", " example_answer \\\n", "0 A \n", "1 C \n", "2 A \n", "3 B \n", "4 C \n", "5 B \n", "6 C \n", "7 A \n", "8 D \n", "9 A \n", "10 C \n", "11 D \n", "12 A \n", "13 B \n", "14 C \n", "15 D \n", "16 B \n", "17 A \n", "18 D \n", "19 B \n", "20 C \n", "21 A \n", "22 B \n", "23 C \n", "24 A \n", "25 D \n", "26 D \n", "27 D \n", "28 D \n", "29 A \n", "30 C \n", "31 C \n", "32 C \n", "33 D \n", "34 B \n", "35 A \n", "36 C \n", "37 B \n", "38 B \n", "39 B \n", "\n", " reasoning \\\n", "0 To determine which scenario involves the main character doing some... \n", "1 To determine which of the following is not a TV cartoon duo, we ne... \n", "2 To answer this question, we need to understand the concept of mark... \n", "3 To answer this question, we need to consider the characteristics o... \n", "4 To determine which scenario involves the main character doing some... \n", "5 The police officer's actions, although intended to frighten the ma... \n", "6 The key issue here is whether the state statute revoking the insur... \n", "7 The concept described in the question involves immigrant children ... \n", "8 The Port Huron Statement, as excerpted, discusses themes of social... \n", "9 The question asks for an eighth-century CE female poet who is wors... \n", "10 To answer this question, we need to consider the context of physic... \n", "11 To answer this question, we need to consider the central arguments... \n", "12 When light hits an opaque material, it does not pass through becau... \n", "13 To answer this question, we need to consider the key components of... \n", "14 To address a recessionary gap and a budget deficit through fiscal ... 
\n", "15 The scenario describes a situation where a company president imple... \n", "16 To answer this question, we need to understand the characteristics... \n", "17 To answer this question, we need to understand the role of differe... \n", "18 To address this question, let's consider what subsistence economie... \n", "19 Developmental psychologists often aim to understand how individual... \n", "20 In this scenario, Kevin, Lisa, and Bob have different needs and pr... \n", "21 To answer this question, we need to understand the tools the Feder... \n", "22 To answer this question, we need to understand what is meant by \"c... \n", "23 When an accountant is engaged to review financial statements in ac... \n", "24 The Copenhagen School's view of security is rooted in the concept ... \n", "25 To determine the mass of the second object in a linear, elastic co... \n", "26 The Treaty of Tordesillas, signed in 1494, was an agreement betwee... \n", "27 When both short-run aggregate supply and aggregate demand increase... \n", "28 To answer this question, we need to analyze the given text and opt... \n", "29 To answer this question, we need to consider the historical contri... \n", "30 To detail new polyomaviruses, we must consider the methods used fo... \n", "31 To find the square root of 81 squared, we first calculate 81 squar... \n", "32 To determine which scenario involves the main character doing some... \n", "33 The Progressive Party Platform of 1912 emphasizes the importance o... \n", "34 The court is likely to grant the hospital's motion for sanctions b... \n", "35 To answer this question, we need to understand the concept of gend... \n", "36 To find the actual population of Louisiana in 2010, we need to con... \n", "37 The question is about the Doppler effect, which describes how the ... \n", "38 To solve this, let's break down the components of the sentence and... \n", "39 Aristotle's theory of the virtues, as outlined in his Nicomachean ... 
\n", "\n", " pred_answer metric \n", "0 A ✔️ [True] \n", "1 C ✔️ [True] \n", "2 A ✔️ [True] \n", "3 B ✔️ [True] \n", "4 C ✔️ [True] \n", "5 B ✔️ [True] \n", "6 C ✔️ [True] \n", "7 A ✔️ [True] \n", "8 D ✔️ [True] \n", "9 A ✔️ [True] \n", "10 C ✔️ [True] \n", "11 D ✔️ [True] \n", "12 A ✔️ [True] \n", "13 B ✔️ [True] \n", "14 C ✔️ [True] \n", "15 D ✔️ [True] \n", "16 B ✔️ [True] \n", "17 A ✔️ [True] \n", "18 B \n", "19 B ✔️ [True] \n", "20 C ✔️ [True] \n", "21 A ✔️ [True] \n", "22 B ✔️ [True] \n", "23 C ✔️ [True] \n", "24 A ✔️ [True] \n", "25 D ✔️ [True] \n", "26 D ✔️ [True] \n", "27 D ✔️ [True] \n", "28 D ✔️ [True] \n", "29 A ✔️ [True] \n", "30 C ✔️ [True] \n", "31 C ✔️ [True] \n", "32 C ✔️ [True] \n", "33 D ✔️ [True] \n", "34 C \n", "35 A ✔️ [True] \n", "36 C ✔️ [True] \n", "37 B ✔️ [True] \n", "38 C \n", "39 B ✔️ [True] " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 64.1 ms, sys: 5.03 ms, total: 69.1 ms\n", "Wall time: 63.5 ms\n" ] } ], "source": [ "%%time\n", "score, results, all_scores = evaluate(\n", " optimized_program,\n", " devset=testset[:40],\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Medium Optimization" ] }, { "cell_type": "code", "execution_count": 95, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:05:39 INFO dspy.teleprompt.mipro_optimizer_v2: \n", "RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:\n", "num_trials: 25\n", "minibatch: True\n", "num_candidates: 25\n", "valset size: 300\n", "\n", "2025/01/29 23:05:39 INFO dspy.teleprompt.mipro_optimizer_v2: \n", "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n", "2025/01/29 23:05:39 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used for informing instruction proposal.\n", "\n", "2025/01/29 23:05:39 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=25 sets of demonstrations...\n" ] }, { "name": "stdout", 
"output_type": "stream", "text": [ "Bootstrapping set 1/25\n", "Bootstrapping set 2/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▉ | 2/500 [00:08<35:10, 4.24s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", "Bootstrapping set 3/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 1%|█▍ | 3/500 [00:00<00:00, 1029.78it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", "Bootstrapping set 4/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▉ | 2/500 [00:09<40:22, 4.86s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", "Bootstrapping set 5/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 1%|█▍ | 3/500 [00:39<1:49:09, 13.18s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", "Bootstrapping set 6/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▍ | 1/500 [00:07<1:03:02, 7.58s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", "Bootstrapping set 7/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▍ | 1/500 [00:05<46:14, 5.56s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", "Bootstrapping set 8/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▍ | 1/500 [00:03<28:32, 3.43s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 1 full traces after 1 
examples for up to 1 rounds, amounting to 1 attempts.\n", "Bootstrapping set 9/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 1%|█▍ | 3/500 [00:20<56:43, 6.85s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", "Bootstrapping set 10/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 1%|█▍ | 3/500 [00:25<1:10:33, 8.52s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", "Bootstrapping set 11/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▉ | 2/500 [00:09<38:41, 4.66s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", "Bootstrapping set 12/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▉ | 2/500 [00:11<47:16, 5.70s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", "Bootstrapping set 13/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▍ | 1/500 [00:08<1:14:39, 8.98s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", "Bootstrapping set 14/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▉ | 2/500 [00:09<39:02, 4.70s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", "Bootstrapping set 15/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▉ | 2/500 [00:11<46:20, 5.58s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 2 full traces after 2 
examples for up to 1 rounds, amounting to 2 attempts.\n", "Bootstrapping set 16/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 1%|█▍ | 3/500 [00:23<1:04:49, 7.82s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", "Bootstrapping set 17/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▍ | 1/500 [00:06<56:44, 6.82s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", "Bootstrapping set 18/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 1%|█▍ | 3/500 [00:14<40:10, 4.85s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", "Bootstrapping set 19/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 1%|█▍ | 3/500 [00:19<54:38, 6.60s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", "Bootstrapping set 20/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▉ | 2/500 [00:10<42:03, 5.07s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", "Bootstrapping set 21/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 1%|█▉ | 4/500 [00:29<1:01:42, 7.46s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n", "Bootstrapping set 22/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 1%|█▍ | 3/500 [00:22<1:02:47, 7.58s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 3 full traces after 3 
examples for up to 1 rounds, amounting to 3 attempts.\n", "Bootstrapping set 23/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▉ | 2/500 [00:14<58:31, 7.05s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", "Bootstrapping set 24/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▍ | 1/500 [00:04<39:09, 4.71s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", "Bootstrapping set 25/25\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 0%|▍ | 1/500 [00:10<1:31:02, 10.95s/it]\n", "2025/01/29 23:11:07 INFO dspy.teleprompt.mipro_optimizer_v2: \n", "==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==\n", "2025/01/29 23:11:07 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:11:19 INFO dspy.teleprompt.mipro_optimizer_v2: \n", "Proposing instructions...\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 0: You are a knowledgeable and insightful assistant designed to provide guidance on multiple-choice questions. Your role involves analyzing questions, assessing answer choices, and offering well-reasoned explanations to enhance understanding. By breaking down complex concepts, you help users develop critical thinking skills and improve their decision-making process. 
You strive to present information in a clear, structured manner while adapting to the user's level of expertise. Ultimately, your goal is to foster deeper comprehension and confidence in tackling multiple-choice assessments.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 1: To tackle a multiple-choice question effectively, carefully read and analyze the question stem, identifying key concepts and any specific details that might narrow down the possible answers. Next, assess each answer choice in relation to the question, considering the relevance, accuracy, and implications of each option. Provide a step-by-step breakdown of your reasoning process, ensuring that each step logically follows from the previous one and that you address any complexities or nuances of the question. In your explanation, define key terms, explain relevant concepts, and justify why certain options can be eliminated or why one option stands out as the correct answer. Throughout your response, maintain clarity and organization, using transitional phrases and clear headings to guide the reader through your thought process. Finally, conclude by stating the correct answer and summarizing the main points that led to this conclusion, reinforcing the reasoning and ensuring that the explanation is accessible to readers of various expertise levels.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 2: You are a skilled educator and analyst with expertise in multiple disciplines, including law, economics, physics, and general knowledge. Your role is to guide users through complex multiple-choice questions by providing detailed, step-by-step reasoning and explanations. You aim to enhance users' critical thinking skills, decision-making processes, and confidence in tackling assessments across various subjects. When approaching a question, consider the context, analyze each option carefully, and break down the reasoning into clear, structured steps. 
Adapt your explanations to cater to users with different levels of expertise, ensuring that your responses are informative, engaging, and easy to understand. By doing so, you will not only help users arrive at the correct answers but also foster a deeper understanding of the subject matter and improve their ability to approach similar questions independently.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 3: Act as an expert tutor guiding students through complex multiple-choice questions. Analyze each question carefully, considering all possible answer choices and their implications. Generate a clear, step-by-step explanation for why a particular answer is correct, focusing on breaking down difficult concepts into manageable parts. Ensure your reasoning is concise, yet detailed enough to illuminate the thought process behind selecting the correct answer. Remember, the goal is not just to provide the right answer, but to educate and enhance the student's understanding and critical thinking skills. Approach each question with the mindset of teaching a student how to think through the problem, rather than just giving them the solution. By doing so, you will help foster a deeper understanding of the subject matter and improve the student's ability to tackle similar questions independently.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 4: Analyze the given multiple-choice question and options, then provide a step-by-step reasoning process to arrive at the correct answer. Ensure the explanation is clear, structured, and adapted to the user's level of expertise, fostering deeper comprehension and confidence in tackling multiple-choice assessments.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 5: To tackle multiple-choice questions effectively, analyze the question stem to identify key concepts and issues. 
Then, evaluate each answer choice by considering its relevance, logical consistency, and alignment with the question's requirements. Break down complex concepts into manageable parts, and assess how each option addresses the question's core concerns. Provide a step-by-step reasoning process that justifies the selection of the correct answer, ensuring that the explanation is clear, concise, and tailored to the user's level of expertise. Ultimately, aim to enhance the user's critical thinking skills and confidence in approaching multiple-choice assessments by offering insightful guidance and fostering a deeper understanding of the subject matter.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 6: To provide a well-structured response, analyze the given multiple-choice question and options. Break down the question into key components, and then assess each option against these components. Offer a step-by-step reasoning process that leads to the selection of the correct answer. Ensure that the reasoning is clear, concise, and adapted to the user's level of expertise. The goal is to not only select the correct answer but to also explain why it is correct and why the other options are incorrect, thereby enhancing the user's understanding and critical thinking skills.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 7: Act as an expert tutor guiding students through complex multiple-choice questions. When presented with a question and a set of options, carefully analyze the query, evaluate each answer choice, and construct a detailed, step-by-step explanation that leads to the correct answer. Ensure your reasoning is clear, concise, and tailored to the user's level of understanding, promoting a deeper grasp of the subject matter and enhancing their ability to approach similar questions with confidence. 
Your explanation should not only justify the correct answer but also provide insight into why the other options are incorrect, thereby fostering critical thinking and analytical skills.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 8: You are a meticulous and analytical expert in multiple disciplines, including law, entertainment, and science, tasked with providing detailed explanations for multiple-choice questions. Your role involves carefully reading each question, evaluating the provided options, and selecting the most appropriate answer based on your knowledge and reasoning skills. For each question, you will provide a step-by-step breakdown of your thought process, explaining why you chose a particular answer and why the other options are incorrect. Your goal is to not only provide the correct answer but also to educate and guide users through the reasoning process, helping them develop their critical thinking skills and improve their ability to tackle complex multiple-choice assessments.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 9: To effectively utilize the language model for solving multiple-choice questions, provide the model with a clear and structured input that includes the question, the available options, and any relevant context or information. The model should then analyze the question, assess each option, and generate a step-by-step reasoning process to arrive at the correct answer. This process should be transparent, with the model explaining its thought process and the rationale behind its choice. The output should include the correct answer, along with a detailed explanation of how the model deduced this answer from the given options. The goal is to not only provide the correct answer but also to educate the user on the reasoning and critical thinking skills required to approach such questions. 
The model should adapt its explanations based on the user's level of expertise, ensuring that the information is presented in a clear and understandable manner. By doing so, the model will help users develop their critical thinking and analytical skills, enhancing their ability to tackle complex multiple-choice questions across various subjects.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 10: You are a skilled analyst and educator with expertise in critical thinking, analytical reasoning, and multiple-choice question strategy. Your role involves guiding users through complex questions, evaluating answer options, and providing detailed, step-by-step explanations to facilitate deeper understanding and improved decision-making skills. By adapting your approach to the user's level of expertise, you aim to enhance their ability to analyze information, identify relevant details, and select the most appropriate answer. When presented with a question, carefully consider the context, assess each option, and generate a well-reasoned explanation for the correct answer, ensuring that your response is clear, structured, and informative.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 11: To utilize the Predict module effectively, provide a clear and concise multiple-choice question along with the available options. Ensure the question is well-structured and relevant to the topic at hand, whether it pertains to legal, psychological, or any other domain. The Predict module is designed to analyze the question, evaluate the provided options, and generate a detailed, step-by-step reasoning process to arrive at the most appropriate answer. This process involves breaking down complex concepts into manageable parts, assessing the relevance and probative value of information, and applying principles from the respective field of study to support the conclusion. 
The goal is to not only select the correct answer but to foster a deeper understanding of the subject matter, enhance critical thinking skills, and promote confidence in tackling similar questions in the future. When formulating your question, consider the context, the principles or rules that apply, and how the options provided relate to the question's requirements. The Predict module will then use this information to produce a comprehensive explanation, making it an invaluable tool for educational purposes, professional development, and the refinement of analytical reasoning skills.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 12: You are a critical component of a high-stakes decision-making system, and your task is to analyze complex multiple-choice questions, assess the provided answer choices, and generate step-by-step reasoning to arrive at the correct answer. The questions span various disciplines, including economics, law, physics, and general knowledge, requiring you to be versatile and knowledgeable. Your explanations must be clear, structured, and adapted to the user's level of expertise, as the consequences of incorrect decisions could be significant. For instance, in a real-world scenario, a incorrect answer could lead to financial losses, legal repercussions, or even physical harm. Your goal is to provide accurate and well-reasoned answers, fostering deeper comprehension and confidence in users as they tackle challenging assessments. You must evaluate each question carefully, considering the context, the subject matter, and the implications of each possible answer choice. 
By doing so, you will help users develop critical thinking skills, improve their decision-making process, and ultimately make informed decisions in high-pressure situations.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 13: You are a skilled educator and subject matter expert, proficient in a wide range of disciplines, including economics, law, physics, and general knowledge. Your role is to analyze complex multiple-choice questions, evaluate the provided options, and generate detailed, step-by-step explanations to justify the correct answer. By doing so, you aim to enhance the user's understanding, foster critical thinking, and improve their ability to approach similar questions with confidence. When responding, please provide a clear and structured reasoning process, adapting your explanation to the user's level of expertise, and ensure that your answer includes the correct choice (A, B, C, D, etc.) along with a thorough justification for why it is the correct option.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 14: You are a skilled educator with expertise in a wide range of subjects, including economics, law, physics, and general knowledge. Your role is to guide students through multiple-choice questions by providing detailed, step-by-step explanations that enhance their understanding and critical thinking skills. When presented with a question and a set of answer options, analyze the question carefully, assess each option, and then offer a well-reasoned explanation for your answer choice. Ensure that your explanations are clear, structured, and adapted to the student's level of expertise. 
Your goal is to not only help students arrive at the correct answer but also to foster deeper comprehension and confidence in tackling multiple-choice assessments across various disciplines.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 15: To provide accurate and helpful responses to multiple-choice questions, analyze the question carefully, considering the subject matter and the specific concepts being tested. Evaluate each answer choice based on its relevance, accuracy, and alignment with the question's requirements. Offer step-by-step reasoning to justify the correct answer, breaking down complex concepts into clear and understandable parts. Ensure that the explanation is structured, easy to follow, and adapted to the user's level of expertise. The goal is to not only provide the correct answer but to also enhance the user's understanding of the subject matter, foster critical thinking skills, and build confidence in tackling similar questions in the future.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 16: To tackle multiple-choice questions effectively, I will employ a systematic approach that involves thoroughly analyzing the question, carefully evaluating each answer choice, and providing detailed, step-by-step reasoning to justify the correct answer. This process will not only help in selecting the right option but also in understanding the underlying concepts and principles. The goal is to break down complex questions into manageable parts, assess each component critically, and synthesize the information to arrive at a well-reasoned conclusion. 
By doing so, the aim is to enhance comprehension, foster critical thinking, and build confidence in addressing multiple-choice assessments across various disciplines.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 17: Act as an expert tutor guiding students through complex multiple-choice questions, providing detailed step-by-step reasoning for each answer choice. Your goal is to not only help students select the correct answer but also to enhance their understanding of the subject matter by breaking down intricate concepts into clear, manageable parts. As you analyze each question, consider the various disciplines and fields of study it may touch upon, such as economics, law, physics, or general knowledge, and tailor your explanations accordingly to foster a deeper and more interdisciplinary understanding. Ensure your reasoning is structured, easy to follow, and adapted to the student's level of expertise, promoting critical thinking, analytical reasoning, and confidence in tackling a wide range of multiple-choice assessments.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 18: You are a critical member of an elite task force responsible for analyzing complex multiple-choice questions and providing accurate, well-reasoned explanations to support your answers. Your team has been entrusted with a high-stakes mission to tackle a series of challenging assessments, and your performance will directly impact the success of the operation. Each question you encounter requires meticulous analysis, careful consideration of the answer choices, and the ability to articulate clear, step-by-step reasoning to justify your conclusions. Your goal is to demonstrate exceptional critical thinking skills, adapt to diverse subjects and question formats, and consistently deliver accurate, confidence-inspiring answers. 
The fate of the mission rests on your ability to think critically, solve problems effectively, and communicate complex ideas with clarity and precision. You must analyze the question, evaluate the options, and provide a well-reasoned explanation for your answer choice, ensuring that your response is concise, informative, and free of ambiguity.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 19: To address the task effectively, I propose the following instruction: \n", "\n", "\"Analyze the given question and options carefully, considering the context and any specific details provided. Break down complex concepts into manageable parts to assess each option's validity. Provide a step-by-step reasoning process to arrive at the correct answer, ensuring that the explanation is clear, structured, and adaptable to the user's level of expertise. The goal is to not only select the correct answer but also to enhance the user's understanding and critical thinking skills through a well-reasoned and detailed explanation.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 20: Act as an expert tutor guiding students through complex multiple-choice questions across various subjects, including economics, law, physics, and general knowledge. Your task is to analyze the question, evaluate the provided options, and generate a step-by-step reasoning process that leads to the correct answer. Ensure your explanations are clear, structured, and adaptable to different levels of expertise, aiming to enhance critical thinking, analytical reasoning, and decision-making skills. When approaching a question, consider the context, identify key concepts, and apply relevant principles or theories to deduce the most appropriate answer. 
Your goal is to not only provide the correct answer but also to educate and foster a deeper understanding of the subject matter, enabling students to tackle similar questions with confidence and accuracy.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 21: You are a critical component of an emergency response system designed to provide accurate and timely guidance on complex, high-stakes multiple-choice questions. In this scenario, a team of experts is racing against time to solve a series of critical problems, and their decisions will have far-reaching consequences. Your role is to analyze each question, assess the answer choices, and provide well-reasoned explanations to support the correct answer. The experts are relying on your guidance to make informed decisions, and the outcome of their efforts depends on the accuracy and clarity of your responses. You must break down complex concepts into clear, structured explanations, adapting your approach to the experts' level of expertise. Your goal is to empower the team to make confident, informed decisions, even in the most pressure-filled situations. You have the ability to ask for clarification or additional information if needed, but you must provide a final answer and explanation within a tight deadline. The fate of the mission rests on your ability to provide accurate and reliable guidance, so you must be meticulous, thorough, and clear in your analysis and explanations.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 22: To address the task effectively, I will analyze the provided question and options, breaking down complex concepts into clear, step-by-step reasoning. My goal is to select the correct answer and provide a detailed explanation that enhances understanding and fosters critical thinking skills. I will adapt my response to the user's level of expertise, ensuring the information is presented in a structured and accessible manner. 
For each question, I will evaluate the options based on relevant knowledge and logical deductions, clearly outlining my thought process to facilitate learning and confidence in tackling multiple-choice assessments.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 23: Analyze the given multiple-choice question and options, then provide a step-by-step reasoning process to determine the correct answer. Ensure the explanation is clear, structured, and adapted to the user's level of expertise, focusing on fostering deeper comprehension and confidence in tackling multiple-choice assessments.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 24: You are a legal expert and educator with extensive experience in evidence law and trial procedures. Your role is to guide students and professionals through complex multiple-choice questions related to legal evidence, ensuring they understand the underlying principles and can apply them to real-world scenarios. Given a question and a set of options, analyze the scenario carefully, considering the relevance, reliability, and potential prejudice of the evidence in question. Then, provide a step-by-step reasoning process that leads to the correct answer, explaining the legal concepts and rules of evidence that apply. 
Your explanations should be clear, concise, and tailored to the user's level of expertise, aiming to enhance their critical thinking and decision-making skills in the context of legal evidence and trial procedures.\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: \n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the default program...\n", "\n", "2025/01/29 23:15:54 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Compute the area of the triangle whose altitudes have lengths 10, 12, and 15.', 'options': {'A': '56\\\\sqrt{15}', 'B': '240\\\\sqrt{7}', 'C': '60\\\\sqrt{7}', 'D': '240\\\\sqrt{7}/7'}, 'answer': 'D', 'reasoning': \"## Step 1: Recall the relationship between the area of a triangle and its altitudes\\nThe area \\\\(A\\\\) of a triangle can be found using the formula \\\\(A = \\\\frac{1}{2}bh\\\\), where \\\\(b\\\\) is the base of the triangle and \\\\(h\\\\) is the height (or altitude) of the triangle. However, when all three altitudes are known, we can use the formula for the area involving the altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), and the sides \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\) of the triangle, given by \\\\(A = \\\\frac{1}{2}h_1a = \\\\frac{1}{2}h_2b = \\\\frac{1}{2}h_3c\\\\), but a more direct approach involves using Heron's formula in conjunction with the relationship between the altitudes and the sides.\\n\\n## Step 2: Use the formula that relates the area of a triangle directly to its altitudes\\nThere's a formula that relates the area \\\\(A\\\\) of a triangle directly to its altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), which is \\\\(A = \\\\sqrt{\\\\frac{h_1h_2h_3}{2}(h_1 + h_2 + h_3)}\\\\) for certain types of triangles, but this isn't universally applicable. 
A more universally applicable method involves understanding that the area can also be expressed in terms of its inradius \\\\(r\\\\) and semiperimeter \\\\(s\\\\), \\\\(A = rs\\\\), and the relationship between the altitudes and the sides can give us \\\\(r\\\\) through \\\\(r = \\\\frac{A}{s}\\\\) and the formula \\\\(A = \\\\sqrt{s(s-a)(s-b)(s-c)}\\\\), but we need a direct link between altitudes and area.\\n\\n## Step 3: Correct Approach - Use the relationship between the altitudes and the area through the inradius\\nFor any triangle with altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), and corresponding sides \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\), the area \\\\(A\\\\) can also be expressed as \\\\(A = \\\\frac{1}{2} \\\\cdot a \\\\cdot h_1 = \\\\frac{1}{2} \\\\cdot b \\\\cdot h_2 = \\\\frac{1}{2} \\\\cdot c \\\\cdot h_3\\\\). However, to directly compute the area from the altitudes without knowing the sides, we consider the formula \\\\(A = \\\\sqrt{\\\\frac{h_1h_2h_3}{2}(h_1 + h_2 + h_3)}\\\\) is not standard; instead, we should focus on the relationship \\\\(A = rs\\\\) where \\\\(r\\\\) is the inradius and \\\\(s\\\\) the semiperimeter. The inradius can be found from the altitudes and the sides through the formula for the area, but without the sides, we look for a relationship that directly ties altitudes to area.\\n\\n## Step 4: Apply the correct formula for the area using altitudes\\nGiven the altitudes \\\\(h_1 = 10\\\\), \\\\(h_2 = 12\\\\), and \\\\(h_3 = 15\\\\), we need to apply a correct and relevant formula that ties these directly to the area. However, the direct calculation of the area from altitudes without the sides or angles involves understanding that \\\\(h_1 = \\\\frac{2A}{a}\\\\), \\\\(h_2 = \\\\frac{2A}{b}\\\\), and \\\\(h_3 = \\\\frac{2A}{c}\\\\), which implies \\\\(A = \\\\frac{1}{2}ah_1 = \\\\frac{1}{2}bh_2 = \\\\frac{1}{2}ch_3\\\\). 
To find \\\\(A\\\\) directly from \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), consider the relationship of the altitudes to the area and the sides, but note that a direct formula might not be straightforward without additional relationships.\\n\\n## Step 5: Realize the necessity of a different approach\\nGiven the complexity and the potential for confusion in directly applying a formula without clear relevance, reconsider the basic principles of geometry and how they apply to triangles, specifically how altitudes relate to the area and the sides. The area of a triangle given its altitudes directly might not be straightforward without considering the properties of right triangles or the use of Heron's formula in a creative way.\\n\\n## Step 6: Reflect on Heron's Formula and its applicability\\nHeron's formula states that the area \\\\(A\\\\) of a triangle whose sides have lengths \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\) is \\\\(A = \\\\sqrt{s(s-a)(s-b)(s-c)}\\\\), where \\\\(s\\\\) is the semiperimeter of the triangle, \\\\(s = \\\\frac{a + b + c}{2}\\\\). However, without the sides, directly applying Heron's formula is not possible.\\n\\n## Step 7: Consider the relationship between altitudes, area, and the inradius\\nThe area \\\\(A\\\\) of a triangle is also equal to \\\\(rs\\\\), where \\\\(r\\\\) is the inradius and \\\\(s\\\\) is the semiperimeter. The inradius can be related to the altitudes and the area, but this requires knowledge of the sides or a relationship that ties the altitudes directly to the inradius and semiperimeter.\\n\\n## Step 8: Final Calculation Approach\\nGiven the altitudes 10, 12, and 15, and recognizing the limitations of the previous steps in directly calculating the area without additional information or a clear, direct formula, we should reconsider the geometric properties that relate altitudes to the area and sides of a triangle. 
However, a common approach to solve this problem involves using the formula for the area based on the altitudes and the sides they intersect, but this typically requires knowledge of the triangle's sides or angles.\\n\\nThe best answer is D.\"}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Average Metric: 271.00 / 299 (90.6%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 2574.39it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:15:54 INFO dspy.evaluate.evaluate: Average Metric: 271.0 / 300 (90.3%)\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 90.33\n", "\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n", "\n", "/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/optuna/_experimental.py:31: ExperimentalWarning: Argument ``multivariate`` is an experimental feature. 
The interface can change in the future.\n", " warnings.warn(\n", "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 1 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 23.00 / 25 (92.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00, 1.88it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:16:07 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n", "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12'].\n", "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0]\n", "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", "\n", "\n", "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 2 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 24.00 / 25 (96.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:16<00:00, 1.49it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:16:24 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)\n", "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1'].\n", "2025/01/29 23:16:24 INFO 
dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0]\n", "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", "\n", "\n", "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 3 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 25.00 / 25 (100.0%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00, 1.90it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:16:38 INFO dspy.evaluate.evaluate: Average Metric: 25 / 25 (100.0%)\n", "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 100.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14'].\n", "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0]\n", "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", "\n", "\n", "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 4 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00, 1.91it/s]" ] }, { "name": "stderr", 
"output_type": "stream", "text": [ "2025/01/29 23:16:51 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8'].\n", "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0]\n", "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", "\n", "\n", "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 5 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:19<00:00, 1.29it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:17:10 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 13'].\n", "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0]\n", "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", "\n", "\n", "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 6 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", 
"text": [ "\n", "Average Metric: 23.00 / 25 (92.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:17<00:00, 1.46it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:17:27 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n", "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 4'].\n", "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0]\n", "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", "\n", "\n", "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 7 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00, 1.69it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:17:42 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n", "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10'].\n", "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0]\n", "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", 
"2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", "\n", "\n", "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 8 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00, 1.74it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:17:57 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n", "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14'].\n", "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0]\n", "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", "\n", "\n", "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 9 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 23.00 / 25 (92.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:42<00:00, 1.68s/it]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:18:39 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n", "2025/01/29 23:18:39 INFO 
dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10'].\n", "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0]\n", "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", "\n", "\n", "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 10 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00, 1.72it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:18:54 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10'].\n", "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0]\n", "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 1 =====\n", "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 96.0) from minibatch 
trials...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 205.00 / 230 (89.1%): 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 230/300 [01:07<00:15, 4.47it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:20:01 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Compute the area of the triangle whose altitudes have lengths 10, 12, and 15.', 'options': {'A': '56\\\\sqrt{15}', 'B': '240\\\\sqrt{7}', 'C': '60\\\\sqrt{7}', 'D': '240\\\\sqrt{7}/7'}, 'answer': 'D', 'reasoning': \"## Step 1: Recall the relationship between the area of a triangle and its altitudes\\nThe area \\\\(A\\\\) of a triangle can be found using the formula \\\\(A = \\\\frac{1}{2}bh\\\\), where \\\\(b\\\\) is the base of the triangle and \\\\(h\\\\) is the height (or altitude) of the triangle. However, when all three altitudes are known, we can use the formula for the area involving the altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), and the sides \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\) of the triangle, given by \\\\(A = \\\\frac{1}{2}h_1a = \\\\frac{1}{2}h_2b = \\\\frac{1}{2}h_3c\\\\), but a more direct approach involves using Heron's formula in conjunction with the relationship between the altitudes and the sides.\\n\\n## Step 2: Use the formula that relates the area of a triangle directly to its altitudes\\nThere's a formula that relates the area \\\\(A\\\\) of a triangle directly to its altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), which is \\\\(A = \\\\sqrt{\\\\frac{h_1h_2h_3}{2}(h_1 + h_2 + h_3)}\\\\) for certain types of triangles, but this isn't universally applicable. 
A more universally applicable method involves understanding that the area can also be expressed in terms of its inradius \\\\(r\\\\) and semiperimeter \\\\(s\\\\), \\\\(A = rs\\\\), and the relationship between the altitudes and the sides can give us \\\\(r\\\\) through \\\\(r = \\\\frac{A}{s}\\\\) and the formula \\\\(A = \\\\sqrt{s(s-a)(s-b)(s-c)}\\\\), but we need a direct link between altitudes and area.\\n\\n## Step 3: Correct Approach - Use the relationship between the altitudes and the area through the inradius\\nFor any triangle with altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), and corresponding sides \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\), the area \\\\(A\\\\) can also be expressed as \\\\(A = \\\\frac{1}{2} \\\\cdot a \\\\cdot h_1 = \\\\frac{1}{2} \\\\cdot b \\\\cdot h_2 = \\\\frac{1}{2} \\\\cdot c \\\\cdot h_3\\\\). However, to directly compute the area from the altitudes without knowing the sides, we consider the formula \\\\(A = \\\\sqrt{\\\\frac{h_1h_2h_3}{2}(h_1 + h_2 + h_3)}\\\\) is not standard; instead, we should focus on the relationship \\\\(A = rs\\\\) where \\\\(r\\\\) is the inradius and \\\\(s\\\\) the semiperimeter. The inradius can be found from the altitudes and the sides through the formula for the area, but without the sides, we look for a relationship that directly ties altitudes to area.\\n\\n## Step 4: Apply the correct formula for the area using altitudes\\nGiven the altitudes \\\\(h_1 = 10\\\\), \\\\(h_2 = 12\\\\), and \\\\(h_3 = 15\\\\), we need to apply a correct and relevant formula that ties these directly to the area. However, the direct calculation of the area from altitudes without the sides or angles involves understanding that \\\\(h_1 = \\\\frac{2A}{a}\\\\), \\\\(h_2 = \\\\frac{2A}{b}\\\\), and \\\\(h_3 = \\\\frac{2A}{c}\\\\), which implies \\\\(A = \\\\frac{1}{2}ah_1 = \\\\frac{1}{2}bh_2 = \\\\frac{1}{2}ch_3\\\\). 
To find \\\\(A\\\\) directly from \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), consider the relationship of the altitudes to the area and the sides, but note that a direct formula might not be straightforward without additional relationships.\\n\\n## Step 5: Realize the necessity of a different approach\\nGiven the complexity and the potential for confusion in directly applying a formula without clear relevance, reconsider the basic principles of geometry and how they apply to triangles, specifically how altitudes relate to the area and the sides. The area of a triangle given its altitudes directly might not be straightforward without considering the properties of right triangles or the use of Heron's formula in a creative way.\\n\\n## Step 6: Reflect on Heron's Formula and its applicability\\nHeron's formula states that the area \\\\(A\\\\) of a triangle whose sides have lengths \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\) is \\\\(A = \\\\sqrt{s(s-a)(s-b)(s-c)}\\\\), where \\\\(s\\\\) is the semiperimeter of the triangle, \\\\(s = \\\\frac{a + b + c}{2}\\\\). However, without the sides, directly applying Heron's formula is not possible.\\n\\n## Step 7: Consider the relationship between altitudes, area, and the inradius\\nThe area \\\\(A\\\\) of a triangle is also equal to \\\\(rs\\\\), where \\\\(r\\\\) is the inradius and \\\\(s\\\\) is the semiperimeter. The inradius can be related to the altitudes and the area, but this requires knowledge of the sides or a relationship that ties the altitudes directly to the inradius and semiperimeter.\\n\\n## Step 8: Final Calculation Approach\\nGiven the altitudes 10, 12, and 15, and recognizing the limitations of the previous steps in directly calculating the area without additional information or a clear, direct formula, we should reconsider the geometric properties that relate altitudes to the area and sides of a triangle. 
However, a common approach to solve this problem involves using the formula for the area based on the altitudes and the sides they intersect, but this typically requires knowledge of the triangle's sides or angles.\\n\\nThe best answer is D.\"}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Average Metric: 264.00 / 299 (88.3%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:55<00:00, 2.59it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:20:50 INFO dspy.evaluate.evaluate: Average Metric: 264.0 / 300 (88.0%)\n", "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n", "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: \n", "\n", "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 11 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 24.00 / 25 (96.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:12<00:00, 1.94it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:21:03 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)\n", "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: 
Instruction 23'].\n", "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0]\n", "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 12 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 24.00 / 25 (96.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 2624.00it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:21:03 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)\n", "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1'].\n", "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0]\n", "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 13 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 21.00 / 25 (84.0%): 
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:15<00:00, 1.63it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:21:18 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n", "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2'].\n", "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0]\n", "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 14 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 23.00 / 25 (92.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:12<00:00, 1.97it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:21:31 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n", "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14'].\n", "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0]\n", "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: 
Full eval scores so far: [90.33, 88.0]\n", "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 15 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00, 1.86it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:21:44 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 9'].\n", "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0]\n", "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 16 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 22.00 / 25 (88.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 2597.48it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 
23:21:44 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1'].\n", "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0]\n", "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 17 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:41<00:00, 1.67s/it]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:22:26 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n", "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11'].\n", "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0]\n", "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:22:26 INFO 
dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 18 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00, 1.72it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:22:41 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5'].\n", "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0]\n", "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 19 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 20.00 / 25 (80.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:18<00:00, 1.36it/s] " ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:22:59 INFO dspy.evaluate.evaluate: Average Metric: 20 / 25 (80.0%)\n", "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 17'].\n", "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: 
[92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0]\n", "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 20 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 22.00 / 25 (88.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:42<00:00, 1.72s/it]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:23:42 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 22'].\n", "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0]\n", "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 2 =====\n", "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 96.0) from minibatch trials...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average 
Metric: 262.00 / 300 (87.3%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:29<00:00, 3.36it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:25:12 INFO dspy.evaluate.evaluate: Average Metric: 262 / 300 (87.3%)\n", "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n", "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n", "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: \n", "\n", "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 21 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 21.00 / 25 (84.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:40<00:00, 1.61s/it]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:25:52 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n", "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 20'].\n", "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0]\n", "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n", "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: 
=============================\n", "\n", "\n", "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 22 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 22.00 / 25 (88.0%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 2398.33it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:25:52 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 23'].\n", "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0, 88.0]\n", "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n", "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 23 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 23.00 / 25 (92.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:39<00:00, 1.56s/it]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:26:32 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n", "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters 
['Predictor 0: Instruction 24'].\n", "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0, 88.0, 92.0]\n", "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n", "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 24 / 25 ==\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 22.00 / 25 (88.0%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 2904.08it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:26:32 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0'].\n", "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0, 88.0, 92.0, 88.0]\n", "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n", "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 25 / 25 ==\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "\n", "Average Metric: 22.00 / 25 (88.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:43<00:00, 1.74s/it]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:27:15 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7'].\n", "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0, 88.0, 92.0, 88.0, 88.0]\n", "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n", "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", "\n", "\n", "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 3 =====\n", "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 92.0) from minibatch trials...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Average Metric: 267.00 / 300 (89.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:30<00:00, 3.30it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:28:46 INFO dspy.evaluate.evaluate: Average Metric: 267 / 300 (89.0%)\n", "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33, 89.0]\n", 
"2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n", "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: \n", "\n", "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 90.33!\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "CPU times: user 18.3 s, sys: 3.02 s, total: 21.3 s\n", "Wall time: 23min 6s\n" ] } ], "source": [ "%%time\n", "subset_size = 500\n", "optimizer = dspy.MIPROv2(\n", " metric=benchmark.metric,\n", " auto=\"medium\",\n", " num_threads=NUM_THREADS,\n", " task_model=TASK_MODEL,\n", " prompt_model=PROMPT_MODEL,\n", " max_bootstrapped_demos=0,\n", " max_labeled_demos=FEW_SHOTS,\n", ")\n", "\n", "optimized_program = optimizer.compile(\n", " program,\n", " trainset=trainset[:subset_size],\n", " valset=valset[:subset_size],\n", " requires_permission_to_run=False,\n", ")" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BEST PROMPT:\n", " You are a knowledgeable and insightful assistant designed to provide guidance on multiple-choice questions. Your role involves analyzing questions, assessing answer choices, and offering well-reasoned explanations to enhance understanding. By breaking down complex concepts, you help users develop critical thinking skills and improve their decision-making process. You strive to present information in a clear, structured manner while adapting to the user's level of expertise. 
Ultimately, your goal is to foster deeper comprehension and confidence in tackling multiple-choice assessments.\n" ] } ], "source": [ "print(\"BEST PROMPT:\\n\", optimized_program.signature.instructions)" ] }, { "cell_type": "code", "execution_count": 98, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BEST EXAMPLES:\n", " []\n" ] } ], "source": [ "print(\"BEST EXAMPLES:\\n\", optimized_program.demos)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "# The optimized ChainOfThought program has no `.predict` attribute in this dspy\n", "# version (accessing it raised AttributeError); the demos live on the program itself.\n", "# dict(...) is used so both dspy.Example and plain-dict demos serialize the same way.\n", "examples_json = [dict(example) for example in optimized_program.demos]\n", "print(\"BEST EXAMPLES:\\n\", json.dumps(examples_json, indent=2))" ] }, { "cell_type": "code", "execution_count": 99, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Average Metric: 453.00 / 500 (90.6%): 
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:42<00:00, 4.90it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2025/01/29 23:32:56 INFO dspy.evaluate.evaluate: Average Metric: 453 / 500 (90.6%)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questionoptionsexample_answerexample_reasoningpred_reasoningpred_answermetric
0For which of these two scenarios does the main character (who uses...{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...A## Step 1: Evaluate the first scenario In the first scenario, the ...To determine which scenario involves the main character doing some...A✔️ [True]
1Two lists, list1 and list2, contain the names of books found in tw...{'A': 'newList ← Combine (listl, list2)\\n newList ← Sort (newList)...ATo create newList, which contains the names of all books found in ...To create newList, which contains the names of all books found in ...A✔️ [True]
2How might the recent global economic crisis be viewed as a challen...{'A': 'Governmental cyberspace restrictions, in the form of censor...C## Step 1: Understanding the liberalist perspective The liberalist...The liberalist perspective emphasizes the importance of free marke...C✔️ [True]
3What term is used to represent unavoidable past costs that cannot ...{'A': 'Prime costs', 'B': 'Sunk costs', 'C': 'Opportunity costs', ...B## Step 1: Understanding the concept of sunk costs Sunk costs are ...To answer this question, we need to understand the concept of each...B✔️ [True]
4Markson Co. traded a concrete-mixing truck with a book value of $1...{'A': 'Does the book value of the asset given up exceed the fair v...C## Step 1: Understand the concept of commercial substance in asset...To determine whether an exchange of assets has commercial substanc...C✔️ [True]
........................
495A man is charged with murder. During the trial, defense counsel of...{'A': 'not hearsay.', 'B': 'hearsay, but admissible as an admissio...DTo answer this question, let's break it down step by step: ## Step...The testimony in question involves a statement made by a man on de...D✔️ [True]
496Two men held-up a liquor store in a city. During the robbery, one ...{'A': 'granted, because the prosecutor is constitutionally require...BTo answer this question, we need to consider the legal implication...The defendant's motion to dismiss the indictment due to the delay ...C
497Which vitamins are important in lowering circulating homocysteine ...{'A': 'Vitamin D', 'B': 'Vitamin C', 'C': 'Vitamin A', 'D': 'Folat...D## Step 1: Understanding the role of vitamins in homocysteine leve...To answer this question, we need to consider the role of vitamins ...D✔️ [True]
498This question refers to the following information. \"The greatest c...{'A': 'African nations will not achieve independence without unity...D## Step 1: Understand the context of Nkrumah's statement Nkrumah e...To answer this question, we need to understand the context and the...D✔️ [True]
499Millions of immigrant children who entered the United States learn...{'A': 'Acculturation', 'B': 'Collective behavior', 'C': 'Social st...A## Step 1: Understanding the concept of acculturation Acculturatio...The concept described in the question involves immigrant children ...A✔️ [True]
\n", "

500 rows × 7 columns

\n", "
" ], "text/plain": [ " question \\\n", "0 For which of these two scenarios does the main character (who uses... \n", "1 Two lists, list1 and list2, contain the names of books found in tw... \n", "2 How might the recent global economic crisis be viewed as a challen... \n", "3 What term is used to represent unavoidable past costs that cannot ... \n", "4 Markson Co. traded a concrete-mixing truck with a book value of $1... \n", ".. ... \n", "495 A man is charged with murder. During the trial, defense counsel of... \n", "496 Two men held-up a liquor store in a city. During the robbery, one ... \n", "497 Which vitamins are important in lowering circulating homocysteine ... \n", "498 This question refers to the following information. \"The greatest c... \n", "499 Millions of immigrant children who entered the United States learn... \n", "\n", " options \\\n", "0 {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr... \n", "1 {'A': 'newList ← Combine (listl, list2)\\n newList ← Sort (newList)... \n", "2 {'A': 'Governmental cyberspace restrictions, in the form of censor... \n", "3 {'A': 'Prime costs', 'B': 'Sunk costs', 'C': 'Opportunity costs', ... \n", "4 {'A': 'Does the book value of the asset given up exceed the fair v... \n", ".. ... \n", "495 {'A': 'not hearsay.', 'B': 'hearsay, but admissible as an admissio... \n", "496 {'A': 'granted, because the prosecutor is constitutionally require... \n", "497 {'A': 'Vitamin D', 'B': 'Vitamin C', 'C': 'Vitamin A', 'D': 'Folat... \n", "498 {'A': 'African nations will not achieve independence without unity... \n", "499 {'A': 'Acculturation', 'B': 'Collective behavior', 'C': 'Social st... \n", "\n", " example_answer \\\n", "0 A \n", "1 A \n", "2 C \n", "3 B \n", "4 C \n", ".. ... \n", "495 D \n", "496 B \n", "497 D \n", "498 D \n", "499 A \n", "\n", " example_reasoning \\\n", "0 ## Step 1: Evaluate the first scenario In the first scenario, the ... 
\n", "1 To create newList, which contains the names of all books found in ... \n", "2 ## Step 1: Understanding the liberalist perspective The liberalist... \n", "3 ## Step 1: Understanding the concept of sunk costs Sunk costs are ... \n", "4 ## Step 1: Understand the concept of commercial substance in asset... \n", ".. ... \n", "495 To answer this question, let's break it down step by step: ## Step... \n", "496 To answer this question, we need to consider the legal implication... \n", "497 ## Step 1: Understanding the role of vitamins in homocysteine leve... \n", "498 ## Step 1: Understand the context of Nkrumah's statement Nkrumah e... \n", "499 ## Step 1: Understanding the concept of acculturation Acculturatio... \n", "\n", " pred_reasoning \\\n", "0 To determine which scenario involves the main character doing some... \n", "1 To create newList, which contains the names of all books found in ... \n", "2 The liberalist perspective emphasizes the importance of free marke... \n", "3 To answer this question, we need to understand the concept of each... \n", "4 To determine whether an exchange of assets has commercial substanc... \n", ".. ... \n", "495 The testimony in question involves a statement made by a man on de... \n", "496 The defendant's motion to dismiss the indictment due to the delay ... \n", "497 To answer this question, we need to consider the role of vitamins ... \n", "498 To answer this question, we need to understand the context and the... \n", "499 The concept described in the question involves immigrant children ... \n", "\n", " pred_answer metric \n", "0 A ✔️ [True] \n", "1 A ✔️ [True] \n", "2 C ✔️ [True] \n", "3 B ✔️ [True] \n", "4 C ✔️ [True] \n", ".. ... ... 
\n", "495 D ✔️ [True] \n", "496 C \n", "497 D ✔️ [True] \n", "498 D ✔️ [True] \n", "499 A ✔️ [True] \n", "\n", "[500 rows x 7 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 4.91 s, sys: 587 ms, total: 5.49 s\n", "Wall time: 1min 42s\n" ] } ], "source": [ "%%time\n", "score, results, all_scores = evaluate(\n", " optimized_program,\n", " devset=testset[:500],\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Heavy Optimization" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Tunables for the heavy MIPROv2 run.\n", "MAX_BOOTSTRAPPED_DEMOS = 5\n", "MAX_LABELED_DEMOS = 5\n", "# The original cell had a bare `max_errors` argument (a SyntaxError after keyword\n", "# arguments). TODO: confirm the intended error budget; 10 is a placeholder.\n", "MAX_ERRORS = 10\n", "OPTIMIZER_MODE = \"heavy\"\n", "\n", "optimizer = dspy.MIPROv2(\n", "    metric=benchmark.metric,\n", "    auto=OPTIMIZER_MODE,\n", "    num_threads=NUM_THREADS,\n", "    task_model=TASK_MODEL,\n", "    prompt_model=PROMPT_MODEL,\n", "    max_labeled_demos=MAX_LABELED_DEMOS,\n", "    max_bootstrapped_demos=MAX_BOOTSTRAPPED_DEMOS,\n", "    max_errors=MAX_ERRORS,\n", ")\n", "\n", "# Unlike the medium run above, this uses the full trainset/valset (no subset slicing).\n", "optimized_program = optimizer.compile(\n", "    program,\n", "    trainset=trainset,\n", "    valset=valset,\n", "    program_aware_proposer=False,\n", "    requires_permission_to_run=False,\n", ")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BEST PROMPT:\n", " self = Predict(StringSignature(question, options -> reasoning, answer\n", " instructions='You are a helpful assistant.'\n", " question = Field(annotation=str required=True json_schema_extra={'desc': 'The question to be answered', '__dspy_field_type': 'input', 'prefix': 'Question:'})\n", " options = Field(annotation=dict required=True json_schema_extra={'desc': 'Dictionary of answer choices', '__dspy_field_type': 'input', 'prefix': 'Options:'})\n", " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${reasoning}', 
'__dspy_field_type': 'output'})\n", " answer = Field(annotation=str required=True json_schema_extra={'desc': 'The correct answer letter', '__dspy_field_type': 'output', 'prefix': 'Answer:'})\n", "))\n" ] } ], "source": [ "print(\"BEST PROMPT:\\n\", optimized_program)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The optimized ChainOfThought program has no `.predict` attribute in this dspy\n", "# version (an earlier cell raised AttributeError on it); read the demos directly.\n", "print(\"BEST EXAMPLES:\\n\", optimized_program.demos)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "score, results, all_scores = evaluate(\n", " optimized_program,\n", " devset=testset,\n", " display_table=False,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 4 }