
updated prompt migration to use benchmarks and MIPRO; added Meta implementation

Justin Lee, 3 months ago
parent
commit
e52e1d1ab4

+ 8 - 0
end-to-end-use-cases/prompt-migration/README.md

@@ -0,0 +1,8 @@
+# prompt-migrator
+
+First, install Rye: https://rye.astral.sh/guide/installation/
+
+```
+rye sync
+. .venv/bin/activate
+```

end-to-end-use-cases/prompt-migration/prompt_migration/__init__.py → end-to-end-use-cases/prompt-migration/benchmarks/__init__.py


+ 10 - 0
end-to-end-use-cases/prompt-migration/benchmarks/datatypes.py

@@ -0,0 +1,10 @@
+import typing as t
+
+if t.TYPE_CHECKING:
+    import dspy
+
+
+class TaskDatasets(t.NamedTuple):
+    trainset: t.Iterable["dspy.Example"]
+    valset: t.Iterable["dspy.Example"]
+    testset: t.Iterable["dspy.Example"]

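A minimal sketch (not part of the commit) of how `TaskDatasets` is meant to be consumed; the toy examples below are hypothetical:

```python
# Hypothetical illustration of TaskDatasets; not part of this commit.
import dspy

from benchmarks.datatypes import TaskDatasets

# A handful of toy dspy.Examples, grouped into the three named splits.
examples = [
    dspy.Example(question=f"q{i}", options=["A. yes", "B. no"], answer="A")
    .with_inputs("question", "options")
    for i in range(10)
]

splits = TaskDatasets(trainset=examples[:6], valset=examples[6:8], testset=examples[8:])
print(len(splits.trainset), len(splits.valset), len(splits.testset))  # 6 2 2
```
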
+ 25 - 0
end-to-end-use-cases/prompt-migration/benchmarks/helpers.py

@@ -0,0 +1,25 @@
+import typing as t
+
+from .datatypes import TaskDatasets
+
+if t.TYPE_CHECKING:
+    from datasets import Dataset
+    import dspy
+
+
+def train_val_test_split(
+    dataset: "Dataset",
+    mapper: t.Callable[[dict], "dspy.Example"],
+    train_size: float = 0.1,
+    validation_size: float = 0.2,
+) -> TaskDatasets:
+    docs = dataset.train_test_split(train_size=train_size)
+    train_docs = docs["train"]
+    docs = docs["test"].train_test_split(train_size=validation_size)
+    validation_docs = docs["train"]
+    test_docs = docs["test"]
+    return TaskDatasets(
+        trainset=list(map(mapper, train_docs)),
+        valset=list(map(mapper, validation_docs)),
+        testset=list(map(mapper, test_docs)),
+    )

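The split sizes compose: `train_size=0.1` keeps 10% of the rows for training, and `validation_size=0.2` then takes 20% of the remaining rows for validation, leaving the rest as the test set. A hedged sketch with a toy in-memory `Dataset` (the rows and mapper are illustrative only):

```python
# Hedged sketch of train_val_test_split on a toy in-memory Dataset;
# the rows and mapper are illustrative only.
import dspy
from datasets import Dataset

from benchmarks.helpers import train_val_test_split


def mapper(row: dict) -> dspy.Example:
    return dspy.Example(question=row["question"], answer=row["answer"]).with_inputs("question")


toy = Dataset.from_dict({"question": [f"q{i}" for i in range(100)], "answer": ["A"] * 100})
splits = train_val_test_split(toy, mapper, train_size=0.1, validation_size=0.2)

# ~10 train rows, ~18 validation rows (20% of the remaining 90), ~72 test rows.
print(len(splits.trainset), len(splits.valset), len(splits.testset))
```
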
+ 61 - 0
end-to-end-use-cases/prompt-migration/benchmarks/leaderboard_mmlu_pro.py

@@ -0,0 +1,61 @@
+import typing as t
+
+from datasets import load_dataset
+import dspy
+
+from .datatypes import TaskDatasets
+from .helpers import train_val_test_split
+
+
+def signature(instructions: str = "") -> dspy.Signature:
+    class MMLUPro(dspy.Signature):
+        __doc__ = instructions
+        question: str = dspy.InputField()
+        options: list[str] = dspy.InputField()
+        answer: str = dspy.OutputField()
+
+    return MMLUPro
+
+
+def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool:
+    return gold.answer == pred.answer
+
+
+def datasets(
+    train_size: float = 0.1,
+    validation_size: float = 0.2,
+) -> TaskDatasets:
+    dataset = load_dataset("TIGER-Lab/MMLU-Pro")
+    return train_val_test_split(
+        dataset["test"], _task_doc_example, train_size, validation_size
+    )
+
+
+class TaskDoc(t.TypedDict):
+    question_id: int
+    question: str
+    options: list[str]
+    answer: str
+    answer_index: int
+    cot_content: str
+    category: str
+    src: str
+
+
+inputs = ["question", "options"]
+outputs = ["answer"]
+
+
+def _num_letter(n: int) -> str:
+    return chr(ord("A") + n)
+
+
+def _task_doc_example(doc: TaskDoc) -> dspy.Example:
+    question = doc["question"]
+    options = [f"{_num_letter(i)}. {option}" for i, option in enumerate(doc["options"])]
+    answer = doc["answer"]
+    return dspy.Example(
+        question=question,
+        options=options,
+        answer=answer,
+    ).with_inputs(*inputs)

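Since the commit message mentions MIPRO, here is a hedged sketch of one way `signature`, `metric`, and `datasets` could be wired into a dspy program and optimizer. The model name and optimizer settings are placeholders, and the `dspy.MIPROv2` / `dspy.Evaluate` usage assumes a recent dspy release rather than anything in this commit:

```python
# Hedged sketch (not part of the commit) of optimizing this benchmark with MIPROv2.
import dspy

from benchmarks import leaderboard_mmlu_pro as task

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # placeholder model

splits = task.datasets()
program = dspy.ChainOfThought(task.signature("Answer with the letter of the correct option."))

# Assumes a recent dspy release where MIPROv2 and Evaluate are exposed at top level.
optimizer = dspy.MIPROv2(metric=task.metric, auto="light")
optimized = optimizer.compile(program, trainset=splits.trainset, valset=splits.valset)

evaluate = dspy.Evaluate(devset=splits.testset, metric=task.metric, num_threads=8)
print(evaluate(optimized))
```
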
+ 48 - 0
end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu_pro.py

@@ -0,0 +1,48 @@
+import typing as t
+
+from datasets import load_dataset
+import dspy
+
+from .datatypes import TaskDatasets
+from .helpers import train_val_test_split
+
+
+def datasets(
+    train_size: float = 0.1,
+    validation_size: float = 0.2,
+) -> TaskDatasets:
+    """
+    TODO:
+    Load dataset, dataset should be datasets.Dataset type (NOT DatasetDict, OR split the dataset yourself how you want)
+    """
+    dataset = load_dataset("TODO")
+    return train_val_test_split(dataset, _task_doc_example, train_size, validation_size)
+
+
+class TaskDoc(t.TypedDict):
+    problem: str
+    gold: str
+
+
+inputs = ["problem"]
+outputs = ["answer"]
+
+
+def _task_doc_example(doc: TaskDoc) -> dspy.Example:
+    return dspy.Example(
+        problem=doc["problem"],
+        answer=doc["gold"],
+    ).with_inputs(*inputs)
+
+
+def signature(instructions: str = "") -> dspy.Signature:
+    class MMLUPro(dspy.Signature):
+        __doc__ = instructions
+        problem: str = dspy.InputField()
+        answer: str = dspy.OutputField()
+
+    return MMLUPro
+
+
+def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool:
+    return gold.answer == pred.answer

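The `load_dataset("TODO")` call is intentionally left unfilled. A hypothetical way to complete it is sketched below; the dataset id and split name are placeholders, and the `problem`/`gold` columns simply mirror the `TaskDoc` above:

```python
# Hypothetical completion of the TODO above; the dataset id and split name
# are placeholders, not confirmed by this commit.
from datasets import load_dataset

from benchmarks.datatypes import TaskDatasets
from benchmarks.helpers import train_val_test_split
from benchmarks.llama_mmlu_pro import _task_doc_example


def datasets(train_size: float = 0.1, validation_size: float = 0.2) -> TaskDatasets:
    # load_dataset usually returns a DatasetDict, so select a single split to
    # hand train_val_test_split the plain Dataset it expects.
    dataset = load_dataset("my-org/llama-mmlu-pro-evals", split="test")  # placeholder
    return train_val_test_split(dataset, _task_doc_example, train_size, validation_size)
```
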
+ 15 - 6
end-to-end-use-cases/prompt-migration/environment.yml

@@ -2,12 +2,21 @@ name: prompt-migration
channels:
  - defaults
  - pytorch
+  - conda-forge
dependencies:
-  - python=3.9
+  - python=3.10  # Updated to match pyproject.toml requires-python
  - pip
+  - numpy<2  # Matches pyproject.toml dependency
  - pip:
-    - dspy-ai
-    - torch
-    - transformers
-    - openai
-    - databricks-sdk 
+      - dspy @ git+ssh://git@github.com/stanfordnlp/dspy.git
+      - lm-eval[wandb,api,math,ifeval,sentencepiece]>=0.4.7
+      - python-dotenv>=1.0.1
+      - ipdb>=0.13.13
+      - ipython>=8.31.0
+      - pytest>=8.3.4
+      - ruff>=0.9.1
+      - ipykernel>=6.29.5
+      - torch
+      - transformers
+      - openai
+      - databricks-sdk

+ 0 - 36
end-to-end-use-cases/prompt-migration/examples/usage.py

@@ -1,36 +0,0 @@
-import dspy
-from prompt_migration.engine import PromptMigrationEngine, PromptTemplate
-from prompt_migration.evaluator import PromptEvaluator
-
-# Initialize LMs
-openai_lm = dspy.OpenAI(model="gpt-3.5-turbo")
-target_lm = dspy.HFModel(model="gpt2")
-
-# Create migration engine
-engine = PromptMigrationEngine(openai_lm, target_lm)
-
-# Define source prompt
-source_prompt = PromptTemplate(
-    template="Summarize the following text: {text}",
-    input_variables=["text"],
-    model_type="openai"
-)
-
-eval_dataset = [
-    {"text": "Example text 1", "expected_answer": "Summary 1"},
-    {"text": "Example text 2", "expected_answer": "Summary 2"},
-]
-
-# Migrate prompt
-migrated_prompt = engine.migrate_prompt(source_prompt, eval_dataset)
-
-# Evaluate migration
-evaluator = PromptEvaluator(openai_lm, target_lm)
-metrics = evaluator.evaluate(
-    source_prompt.template,
-    migrated_prompt.template,
-    eval_dataset
-)
-
-print(f"Migrated prompt: {migrated_prompt.template}")
-print(f"Evaluation metrics: {metrics}") 

Diff too large to display
+ 980 - 0
end-to-end-use-cases/prompt-migration/harness.ipynb


+ 0 - 103
end-to-end-use-cases/prompt-migration/main.py

@@ -1,103 +0,0 @@
-import dspy
-from prompt_migration.engine import PromptMigrationEngine, PromptTemplate
-from prompt_migration.evaluator import PromptEvaluator
-from prompt_migration.eval_dataset import get_evaluation_dataset, get_eval_subset
-
-import os
-import dotenv
-
-dotenv.load_dotenv()
-
-def main():
-    openai_lm = dspy.LM(
-        model="gpt-3.5-turbo",
-        api_key=os.getenv("OPENAI_API_KEY")
-    )
-    
-    target_lm = dspy.LM(
-        model="together_ai/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
-        api_key=os.getenv("TOGETHER_API_KEY")
-    )
-    # To run it with ollama
-    # target_lm = dspy.LM('ollama_chat/llama3.2:3b-instruct-fp16', api_base='http://localhost:11434', api_key='')
-
-    # To run it with huggingface
-    # target_lm = dspy.HFModel(model="gpt2")
-    
-    engine = PromptMigrationEngine(openai_lm, target_lm)
-    
-    source_prompt = PromptTemplate(
-        template="""You are an advanced Large Language Model tasked with generating Python code snippets in response to user prompts. Your primary objective is to provide accurate, concise, and well-structured Python functions. Follow these guidelines:
-
-    Understand the Context: Analyze the input prompt and identify its category (e.g., API Usage, File Handling, Error Handling).
-
-    Generate Code:
-        Write Python code that directly addresses the user's request.
-        Ensure the code is syntactically correct, functional, and adheres to Python best practices.
-        Include necessary imports and handle potential edge cases.
-
-    Error Handling:
-        Include appropriate error handling where applicable (e.g., try-except blocks).
-        If exceptions occur, provide meaningful error messages.
-
-    Readability:
-        Use clear variable names and include comments where necessary for clarity.
-        Prioritize readability and maintainability in all generated code.
-
-    Complexity Alignment:
-        Tailor the code's complexity based on the indicated difficulty (e.g., simple, medium, complex).
-        Ensure that the solution is neither overly simplistic nor unnecessarily complicated.
-
-    Prompt Type:
-        Focus on the code_generation type for creating Python functions.
-        Avoid deviating from the task unless additional clarification is requested.
-
-    Testing and Validity:
-        Assume the function might be run immediately. Provide code that is ready for use or minimal adaptation.
-        Highlight any dependencies or external libraries required.
-        """,
-        input_variables=["text"],
-        model_type="openai"
-    )
-    
-    eval_dataset = get_evaluation_dataset()
-
-
-    # To evaluate on a specific subset, use the following:
-    code_generation_dataset = get_eval_subset(prompt_type="code_generation")
-    #simple_tasks = get_eval_subset(complexity="simple")
-    evaluator = PromptEvaluator(openai_lm, target_lm)
-
-    metrics = evaluator.evaluate(
-        source_prompt.template,  # Same prompt for both
-        source_prompt.template,  # Same prompt for both
-        code_generation_dataset
-    )
-    
-    print(f"Evaluation metrics:")
-    print(f"  Accuracy: {metrics.accuracy:.2f}")
-    print(f"  Similarity: {metrics.similarity:.2f}")
-    print(f"  Consistency: {metrics.consistency:.2f}")
-    
-    # Migrate prompt
-    print("Migrating prompt...")
-    migrated_prompt = engine.migrate_prompt(source_prompt, code_generation_dataset)
-    
-    # Evaluate migration
-    print("Evaluating migration...")
-    metrics = evaluator.evaluate(
-        source_prompt.template,
-        migrated_prompt.template,
-        code_generation_dataset
-    )
-    
-    print(f"\nResults:")
-    print(f"Original prompt: {source_prompt.template}")
-    print(f"Migrated prompt: {migrated_prompt.template}")
-    print(f"Evaluation metrics:")
-    print(f"  Accuracy: {metrics.accuracy:.2f}")
-    print(f"  Similarity: {metrics.similarity:.2f}")
-    print(f"  Consistency: {metrics.consistency:.2f}")
-
-if __name__ == "__main__":
-    main() 

Diff too large to display
+ 851 - 0
end-to-end-use-cases/prompt-migration/notebooks/harness.ipynb


+ 0 - 106
end-to-end-use-cases/prompt-migration/prompt_migration/engine.py

@@ -1,106 +0,0 @@
-import dspy
-from typing import List, Dict, Optional
-from dataclasses import dataclass
-
-@dataclass
-class PromptTemplate:
-    template: str
-    input_variables: List[str]
-    model_type: str  # 'openai' or 'llama'
-
-class PromptMigrationEngine:
-    def __init__(self, source_lm: dspy.LM, target_lm: dspy.LM):
-        self.source_lm = source_lm
-        self.target_lm = target_lm
-        dspy.configure(lm=source_lm)
-    
-    def _optimize_transformation(self, transformer, eval_dataset):
-        """Optimize the transformation using the evaluation dataset."""
-        class PromptQualityMetric:
-            def __init__(self, source_lm, target_lm):
-                self.source_lm = source_lm
-                self.target_lm = target_lm
-            
-            def __call__(self, example, prediction, trace=None):
-                if not hasattr(prediction, 'target'):
-                    return 0.0
-                
-                try:
-                    # Get outputs from both models using the prompts
-                    source_output = self.source_lm(example.source)
-                    target_output = self.target_lm(prediction.target)
-                    
-                    # Compare outputs (basic similarity)
-                    from difflib import SequenceMatcher
-                    similarity = SequenceMatcher(None, 
-                                              str(source_output), 
-                                              str(target_output)).ratio()
-                    return similarity
-                except Exception as e:
-                    print(f"Error in metric: {e}")
-                    return 0.0
-        
-        optimizer = dspy.BootstrapFewShotWithRandomSearch(
-            metric=PromptQualityMetric(self.source_lm, self.target_lm),
-            max_bootstrapped_demos=2,
-            max_labeled_demos=2,
-            num_threads=1
-        )
-        
-        # Prepare training data
-        train_data = []
-        for item in eval_dataset:
-            # Create example with both prompt and expected output
-            example = dspy.Example(
-                source=item["text"],
-                expected_output=item["expected_answer"]
-            ).with_inputs("source")
-            train_data.append(example)
-        
-        return optimizer.compile(transformer, trainset=train_data)
-    
-    def migrate_prompt(self, 
-                      source_prompt: PromptTemplate,
-                      eval_dataset: Optional[List[Dict]] = None) -> PromptTemplate:
-        """Migrates a prompt from source LM to target LM format."""
-        
-        class PromptTransformation(dspy.Signature):
-            """Convert a prompt from one format to another."""
-            source = dspy.InputField(desc="Source prompt template")
-            target = dspy.OutputField(desc="Transformed prompt template that maintains functionality while adapting to target model format")
-        
-        class Transformer(dspy.Module):
-            def __init__(self):
-                super().__init__()
-                self.chain = dspy.ChainOfThought(PromptTransformation)
-            
-            def forward(self, source):
-                # Add context about the transformation task
-                prompt = f"""
-                Transform this prompt while:
-                1. Maintaining core functionality
-                2. Adapting to target model format
-                3. Preserving input variables
-                4. Keeping essential instructions
-                
-                Source prompt:
-                {source}
-                """
-                return self.chain(source=prompt)
-        
-        transformer = Transformer()
-        
-        if eval_dataset:
-            transformer = self._optimize_transformation(transformer, eval_dataset)
-            
-        result = transformer(source=source_prompt.template)
-        
-        # Format for target model
-        if source_prompt.model_type == "openai" and "llama" in str(self.target_lm):
-            result.target = f"### Instruction:\n{result.target}\n\n### Response:"
-        
-        return PromptTemplate(
-            template=result.target,
-            input_variables=source_prompt.input_variables,
-            model_type='llama'
-        ) 

+ 0 - 288
end-to-end-use-cases/prompt-migration/prompt_migration/eval_dataset.py

@@ -1,288 +0,0 @@
-from typing import List, Dict
-
-def get_evaluation_dataset() -> List[Dict]:
-    """
-    Returns a comprehensive evaluation dataset for testing prompt migrations.
-    Each test case includes:
-    - text: Input text
-    - expected_answer: Expected output
-    - prompt_type: Type of prompt (summarization, classification, qa, etc.)
-    - complexity: Difficulty level (simple, medium, complex)
-    """
-    return [
-        # Summarization examples
-        {
-            "text": "The quick brown fox jumps over the lazy dog.",
-            "expected_answer": "A fox jumps over a dog.",
-            "prompt_type": "summarization",
-            "complexity": "simple"
-        },
-        {
-            "text": """Machine learning is a subset of artificial intelligence that focuses on developing 
-                   systems that can learn from and make decisions based on data. It has numerous 
-                   applications in various fields including healthcare, finance, and autonomous vehicles.""",
-            "expected_answer": "Machine learning is an AI technology that enables systems to learn and make decisions from data, used in healthcare, finance, and autonomous vehicles.",
-            "prompt_type": "summarization",
-            "complexity": "medium"
-        },
-
-        # Classification examples
-        {
-            "text": "I absolutely loved this product! Best purchase ever!",
-            "expected_answer": "Positive",
-            "prompt_type": "sentiment_classification",
-            "complexity": "simple"
-        },
-        {
-            "text": "The product works fine but the customer service could be better.",
-            "expected_answer": "Neutral",
-            "prompt_type": "sentiment_classification",
-            "complexity": "medium"
-        },
-
-        # Question-Answering examples
-        {
-            "text": "What is the capital of France? Context: Paris is the capital and largest city of France.",
-            "expected_answer": "Paris",
-            "prompt_type": "qa",
-            "complexity": "simple"
-        },
-        {
-            "text": """What causes rain? Context: Rain is precipitation of liquid water in the form of droplets. 
-                   Water vapor in warm air rises and cools, forming clouds. When the droplets become too 
-                   heavy, they fall as rain.""",
-            "expected_answer": "Rain occurs when water vapor in warm air rises, cools to form clouds, and droplets become heavy enough to fall.",
-            "prompt_type": "qa",
-            "complexity": "medium"
-        },
-
-        # Code-related examples
-        {
-            "text": "Write a function to add two numbers in Python.",
-            "expected_answer": "def add(a, b):\n    return a + b",
-            "prompt_type": "code_generation",
-            "complexity": "simple"
-        },
-        {
-            "text": "Explain what this code does: for i in range(len(arr)): arr[i] *= 2",
-            "expected_answer": "This code multiplies each element in the array 'arr' by 2.",
-            "prompt_type": "code_explanation",
-            "complexity": "simple"
-        },
-
-        # Text transformation examples
-        {
-            "text": "convert this to passive voice: The cat chased the mouse.",
-            "expected_answer": "The mouse was chased by the cat.",
-            "prompt_type": "text_transformation",
-            "complexity": "simple"
-        },
-        {
-            "text": "translate to French: Hello, how are you?",
-            "expected_answer": "Bonjour, comment allez-vous?",
-            "prompt_type": "translation",
-            "complexity": "simple"
-        },
-
-        # Complex reasoning examples
-        {
-            "text": """A train leaves Station A at 2:00 PM traveling at 60 mph. Another train leaves 
-                   Station B at 3:00 PM traveling at 75 mph in the opposite direction. If the stations 
-                   are 375 miles apart, at what time will the trains meet?""",
-            "expected_answer": "The trains will meet at 5:00 PM.",
-            "prompt_type": "problem_solving",
-            "complexity": "complex"
-        },
-        {
-            "text": """Analyze the environmental impact of electric vehicles versus traditional 
-                   gasoline vehicles, considering manufacturing, operation, and disposal.""",
-            "expected_answer": """Electric vehicles typically have higher manufacturing emissions but lower 
-                              operational emissions compared to gasoline vehicles. Overall lifecycle 
-                              environmental impact depends on electricity source and battery recycling.""",
-            "prompt_type": "analysis",
-            "complexity": "complex"
-        },
-
-        # Simple Code Generation
-        {
-            "text": "Write a Python function to check if a number is prime.",
-            "expected_answer": """def is_prime(n):
-    if n < 2:
-        return False
-    for i in range(2, int(n ** 0.5) + 1):
-        if n % i == 0:
-            return False
-    return True""",
-            "prompt_type": "code_generation",
-            "complexity": "medium"
-        },
-        {
-            "text": "Create a Python function to reverse a string.",
-            "expected_answer": """def reverse_string(s):
-    return s[::-1]""",
-            "prompt_type": "code_generation",
-            "complexity": "simple"
-        },
-        
-        # Code Explanation
-        {
-            "text": "Explain what this code does: [x*x for x in range(10) if x % 2 == 0]",
-            "expected_answer": "This list comprehension creates a list of squares of even numbers from 0 to 9. It filters numbers where x modulo 2 equals 0 (even numbers) and squares them.",
-            "prompt_type": "code_explanation",
-            "complexity": "medium"
-        },
-        
-        # Algorithm Implementation
-        {
-            "text": "Write a Python function to implement binary search.",
-            "expected_answer": """def binary_search(arr, target):
-    left, right = 0, len(arr) - 1
-    
-    while left <= right:
-        mid = (left + right) // 2
-        if arr[mid] == target:
-            return mid
-        elif arr[mid] < target:
-            left = mid + 1
-        else:
-            right = mid - 1
-            
-    return -1""",
-            "prompt_type": "code_generation",
-            "complexity": "medium"
-        },
-        
-        # Data Structure Implementation
-        {
-            "text": "Implement a Stack class in Python using a list.",
-            "expected_answer": """class Stack:
-    def __init__(self):
-        self.items = []
-        
-    def push(self, item):
-        self.items.append(item)
-        
-    def pop(self):
-        if not self.is_empty():
-            return self.items.pop()
-        
-    def is_empty(self):
-        return len(self.items) == 0
-        
-    def peek(self):
-        if not self.is_empty():
-            return self.items[-1]""",
-            "prompt_type": "code_generation",
-            "complexity": "medium"
-        },
-        
-        # Code Debugging
-        {
-            "text": "Find and fix the bug in this code: def factorial(n): return n * factorial(n-1)",
-            "expected_answer": """def factorial(n):
-    if n == 0 or n == 1:
-        return 1
-    return n * factorial(n-1)""",
-            "prompt_type": "code_debugging",
-            "complexity": "medium"
-        },
-        
-        # Code Optimization
-        {
-            "text": "Optimize this code: def fibonacci(n): return fibonacci(n-1) + fibonacci(n-2) if n > 1 else n",
-            "expected_answer": """def fibonacci(n):
-    if n <= 1:
-        return n
-    a, b = 0, 1
-    for _ in range(2, n + 1):
-        a, b = b, a + b
-    return b""",
-            "prompt_type": "code_optimization",
-            "complexity": "medium"
-        },
-        
-        # API Usage
-        {
-            "text": "Write a Python function using requests to fetch data from a REST API endpoint.",
-            "expected_answer": """import requests
-
-def fetch_data(url, params=None):
-    try:
-        response = requests.get(url, params=params)
-        response.raise_for_status()
-        return response.json()
-    except requests.RequestException as e:
-        print(f"Error fetching data: {e}")
-        return None""",
-            "prompt_type": "code_generation",
-            "complexity": "medium"
-        },
-        
-        # File Handling
-        {
-            "text": "Write a Python function to read a CSV file and return it as a list of dictionaries.",
-            "expected_answer": """import csv
-
-def read_csv(file_path):
-    data = []
-    try:
-        with open(file_path, 'r') as file:
-            reader = csv.DictReader(file)
-            for row in reader:
-                data.append(row)
-        return data
-    except Exception as e:
-        print(f"Error reading CSV: {e}")
-        return None""",
-            "prompt_type": "code_generation",
-            "complexity": "medium"
-        },
-        
-        # Error Handling
-        {
-            "text": "Write a Python function that safely converts a string to integer with error handling.",
-            "expected_answer": """def safe_int_convert(s):
-    try:
-        return int(s), None
-    except ValueError as e:
-        return None, str(e)""",
-            "prompt_type": "code_generation",
-            "complexity": "simple"
-        },
-        
-        # Complex Algorithm
-        {
-            "text": "Implement a Python function for Depth-First Search on a graph.",
-            "expected_answer": """def dfs(graph, start, visited=None):
-    if visited is None:
-        visited = set()
-    
-    visited.add(start)
-    
-    for next_node in graph[start]:
-        if next_node not in visited:
-            dfs(graph, next_node, visited)
-            
-    return visited""",
-            "prompt_type": "code_generation",
-            "complexity": "complex"
-        }
-    ]
-
-def get_eval_subset(prompt_type: str = None, complexity: str = None) -> List[Dict]:
-    """
-    Returns a filtered subset of the evaluation dataset based on prompt type and/or complexity.
-    
-    Args:
-        prompt_type: Type of prompts to filter (e.g., 'summarization', 'qa', etc.)
-        complexity: Complexity level to filter (e.g., 'simple', 'medium', 'complex')
-    """
-    dataset = get_evaluation_dataset()
-    
-    if prompt_type:
-        dataset = [d for d in dataset if d["prompt_type"] == prompt_type]
-    
-    if complexity:
-        dataset = [d for d in dataset if d["complexity"] == complexity]
-    
-    return dataset 

+ 0 - 188
end-to-end-use-cases/prompt-migration/prompt_migration/evaluator.py

@@ -1,188 +0,0 @@
-import json
-from typing import List, Dict
-from dataclasses import dataclass
-import dspy
-import os
-from datetime import datetime
-
-@dataclass
-class EvaluationMetrics:
-    accuracy: float
-    similarity: float
-    consistency: float
-    individual_scores: List[Dict]  # Store individual test case scores
-
-class PromptEvaluator:
-    def __init__(self, source_lm: dspy.LM, target_lm: dspy.LM):
-        self.source_lm = source_lm
-        self.target_lm = target_lm
-        dspy.configure(lm=source_lm)  # Configure DSPy to use source_lm for judge
-        
-    def _create_judge(self):
-        """Create an LLM judge to evaluate outputs."""
-        class OutputJudge(dspy.Signature):
-            """Judge the quality and equivalence of outputs."""
-            input_text = dspy.InputField(desc="The coding task")
-            source_output = dspy.InputField(desc="Output from source prompt")
-            target_output = dspy.InputField(desc="Output from target prompt")
-            expected_output = dspy.InputField(desc="Expected output from dataset")
-            
-            equivalence = dspy.OutputField(
-                desc="Are the outputs functionally equivalent to the expected output? Answer ONLY with 'yes' or 'no'."
-            )
-            accuracy = dspy.OutputField(
-                desc="Rate how well the outputs match the expected output. Provide ONLY a number between 0 and 100, no text."
-            )
-            consistency = dspy.OutputField(
-                desc="Rate how consistent the outputs are with each other. Provide ONLY a number between 0 and 100, no text."
-            )
-            reasoning = dspy.OutputField(
-                desc="Explain your evaluation, focusing on functionality and correctness."
-            )
-
-        class Judge(dspy.Module):
-            def __init__(self):
-                super().__init__()
-                self.judge = dspy.ChainOfThought(OutputJudge)
-            
-            def forward(self, input_text, source_output, target_output, expected_output):
-                try:
-                    result = self.judge(
-                        input_text=input_text,
-                        source_output=source_output,
-                        target_output=target_output,
-                        expected_output=expected_output
-                    )
-                    
-                    # Ensure numeric scores
-                    def clean_score(score):
-                        try:
-                            # Extract just numbers
-                            import re
-                            numbers = re.findall(r'\d+', str(score))
-                            return float(numbers[0]) if numbers else 0.0
-                        except:
-                            return 0.0
-                    
-                    result.accuracy = clean_score(result.accuracy)
-                    result.consistency = clean_score(result.consistency)
-                    result.equivalence = str(result.equivalence).lower().strip()
-                    
-                    return result
-                except Exception as e:
-                    print(f"Error in judge: {str(e)}")
-                    return type('Result', (), {
-                        'accuracy': '0',
-                        'consistency': '0',
-                        'equivalence': 'no',
-                        'reasoning': f'Error in evaluation: {str(e)}'
-                    })()
-
-        return Judge()
-
-    def _get_model_output(self, prompt: str, input_text: str) -> str:
-        """Get output from target model using the provided prompt."""
-        try:
-            formatted_prompt = prompt.format(text=input_text)
-            response = self.target_lm(formatted_prompt)
-            
-            if isinstance(response, list):
-                return response[0] if response else ""
-            return str(response)
-        except Exception as e:
-            print(f"Error generating output: {str(e)}")
-            return ""
-
-    def _calculate_metrics(self, source_prompt: str, target_prompt: str, test_cases: List[Dict]) -> EvaluationMetrics:
-        """Calculate evaluation metrics using target model for both prompts."""
-        total_similarity = 0.0
-        total_accuracy = 0.0
-        total_consistency = 0.0
-        individual_scores = []
-        
-        judge = self._create_judge()
-        num_cases = len(test_cases)
-        
-        for case in test_cases:
-            input_text = case["text"]
-            expected = case["expected_answer"]
-            
-            # Get outputs from target model using both prompts
-            source_output = self._get_model_output(source_prompt, input_text)
-            target_output = self._get_model_output(target_prompt, input_text)
-            
-            judgment = judge(
-                input_text=input_text,
-                source_output=source_output,
-                target_output=target_output,
-                expected_output=expected
-            )
-            
-            accuracy_score = float(judgment.accuracy) / 100
-            consistency_score = float(judgment.consistency) / 100
-            is_equivalent = judgment.equivalence.lower() == "yes"
-            
-            case_scores = {
-                "input": input_text,
-                "expected": expected,
-                "source_output": source_output,
-                "target_output": target_output,
-                "accuracy": accuracy_score,
-                "consistency": consistency_score,
-                "equivalent": is_equivalent,
-                "reasoning": judgment.reasoning
-            }
-            individual_scores.append(case_scores)
-            
-            total_accuracy += accuracy_score
-            total_consistency += consistency_score
-            total_similarity += float(is_equivalent)
-            
-            print(f"\nEvaluation for case: {input_text[:50]}...")
-            print(f"Source output: {source_output[:100]}...")
-            print(f"Target output: {target_output[:100]}...")
-            print(f"Expected: {expected[:100]}...")
-            print(f"Judge's reasoning: {judgment.reasoning}")
-            print(f"Scores - Accuracy: {accuracy_score:.2f}, Consistency: {consistency_score:.2f}, Equivalent: {is_equivalent}")
-        
-        metrics = EvaluationMetrics(
-            accuracy=total_accuracy / num_cases,
-            similarity=total_similarity / num_cases,
-            consistency=total_consistency / num_cases,
-            individual_scores=individual_scores
-        )
-        
-        results = {
-            "source_prompt": source_prompt,
-            "target_prompt": target_prompt,
-            "aggregate_metrics": {
-                "accuracy": metrics.accuracy,
-                "similarity": metrics.similarity,
-                "consistency": metrics.consistency
-            },
-            "individual_scores": individual_scores
-        }
-        
-        self._save_results(results)
-
-        
-        return metrics
-    
-    def evaluate(self, 
-                source_prompt: str, 
-                target_prompt: str, 
-                test_cases: List[Dict]) -> EvaluationMetrics:
-        """Evaluates both prompts using the target model."""
-        return self._calculate_metrics(source_prompt, target_prompt, test_cases)
-    
-    def _save_results(self, results: dict, filename: str = 'results.json') -> None:
-        """Save results to a JSON file with a new name if the file already exists."""
-
-        if os.path.exists(filename):
-            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-            base, ext = os.path.splitext(filename)
-            filename = f"{base}_{timestamp}{ext}"
-        
-        with open(filename, 'w') as f:
-            json.dump(results, f, indent=2)
-        print(f"Results saved to {filename}")

+ 33 - 0
end-to-end-use-cases/prompt-migration/pyproject.toml

@@ -0,0 +1,33 @@
+[project]
+name = "llama-prompt-migrator"
+version = "0.1.0"
+description = ""
+authors = [{ name = "Cyrus Nouroozi", email = "cyrus@zenbase.ai" }]
+dependencies = [
+    "dspy @ git+https://github.com/stanfordnlp/dspy.git",
+    "numpy<2",
+    "lm-eval[wandb,api,math,ifeval,sentencepiece]>=0.4.7",
+    "python-dotenv>=1.0.1",
+]
+readme = "README.md"
+requires-python = ">= 3.10"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.rye]
+managed = true
+dev-dependencies = [
+    "ipdb>=0.13.13",
+    "ipython>=8.31.0",
+    "pytest>=8.3.4",
+    "ruff>=0.9.1",
+    "ipykernel>=6.29.5",
+]
+
+[tool.hatch.metadata]
+allow-direct-references = true
+
+[tool.hatch.build.targets.wheel]
+packages = ["prompt_migrator/"]

+ 0 - 1
end-to-end-use-cases/prompt-migration/readme.md

@@ -1 +0,0 @@
-#TODO

Diff too large to display
+ 1080 - 0
end-to-end-use-cases/prompt-migration/v2/notebooks/harness.ipynb