
updated prompt migration to use benchmarks and MIPRO; added Meta implementation

Justin Lee, 3 months ago
parent
commit
e52e1d1ab4

+ 8 - 0
end-to-end-use-cases/prompt-migration/README.md

@@ -0,0 +1,8 @@
+# prompt-migrator
+
+First, install Rye: https://rye.astral.sh/guide/installation/
+
+```
+rye sync
+. .venv/bin/activate
+```

end-to-end-use-cases/prompt-migration/prompt_migration/__init__.py → end-to-end-use-cases/prompt-migration/benchmarks/__init__.py


+ 10 - 0
end-to-end-use-cases/prompt-migration/benchmarks/datatypes.py

@@ -0,0 +1,10 @@
+import typing as t
+
+if t.TYPE_CHECKING:
+    import dspy
+
+
+class TaskDatasets(t.NamedTuple):
+    trainset: t.Iterable["dspy.Example"]
+    valset: t.Iterable["dspy.Example"]
+    testset: t.Iterable["dspy.Example"]

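A minimal sketch (not part of the commit) of how `TaskDatasets` is meant to be consumed; the toy examples below are hypothetical:

```python
# Hypothetical illustration of TaskDatasets; not part of this commit.
import dspy

from benchmarks.datatypes import TaskDatasets

# A handful of toy dspy.Examples, grouped into the three named splits.
examples = [
    dspy.Example(question=f"q{i}", options=["A. yes", "B. no"], answer="A")
    .with_inputs("question", "options")
    for i in range(10)
]

splits = TaskDatasets(trainset=examples[:6], valset=examples[6:8], testset=examples[8:])
print(len(splits.trainset), len(splits.valset), len(splits.testset))  # 6 2 2
```
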
+ 25 - 0
end-to-end-use-cases/prompt-migration/benchmarks/helpers.py

@@ -0,0 +1,25 @@
+import typing as t
+
+from .datatypes import TaskDatasets
+
+if t.TYPE_CHECKING:
+    from datasets import Dataset
+    import dspy
+
+
+def train_val_test_split(
+    dataset: "Dataset",
+    mapper: t.Callable[[dict], "dspy.Example"],
+    train_size: float = 0.1,
+    validation_size: float = 0.2,
+) -> TaskDatasets:
+    docs = dataset.train_test_split(train_size=train_size)
+    train_docs = docs["train"]
+    docs = docs["test"].train_test_split(train_size=validation_size)
+    validation_docs = docs["train"]
+    test_docs = docs["test"]
+    return TaskDatasets(
+        trainset=list(map(mapper, train_docs)),
+        valset=list(map(mapper, validation_docs)),
+        testset=list(map(mapper, test_docs)),
+    )

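The split sizes compose: `train_size=0.1` keeps 10% of the rows for training, and `validation_size=0.2` then takes 20% of the remaining rows for validation, leaving the rest as the test set. A hedged sketch with a toy in-memory `Dataset` (the rows and mapper are illustrative only):

```python
# Hedged sketch of train_val_test_split on a toy in-memory Dataset;
# the rows and mapper are illustrative only.
import dspy
from datasets import Dataset

from benchmarks.helpers import train_val_test_split


def mapper(row: dict) -> dspy.Example:
    return dspy.Example(question=row["question"], answer=row["answer"]).with_inputs("question")


toy = Dataset.from_dict({"question": [f"q{i}" for i in range(100)], "answer": ["A"] * 100})
splits = train_val_test_split(toy, mapper, train_size=0.1, validation_size=0.2)

# ~10 train rows, ~18 validation rows (20% of the remaining 90), ~72 test rows.
print(len(splits.trainset), len(splits.valset), len(splits.testset))
```
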
+ 61 - 0
end-to-end-use-cases/prompt-migration/benchmarks/leaderboard_mmlu_pro.py

@@ -0,0 +1,61 @@
+import typing as t
+
+from datasets import load_dataset
+import dspy
+
+from .datatypes import TaskDatasets
+from .helpers import train_val_test_split
+
+
+def signature(instructions: str = "") -> dspy.Signature:
+    class MMLUPro(dspy.Signature):
+        __doc__ = instructions
+        question: str = dspy.InputField()
+        options: list[str] = dspy.InputField()
+        answer: str = dspy.OutputField()
+
+    return MMLUPro
+
+
+def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool:
+    return gold.answer == pred.answer
+
+
+def datasets(
+    train_size: float = 0.1,
+    validation_size: float = 0.2,
+) -> TaskDatasets:
+    dataset = load_dataset("TIGER-Lab/MMLU-Pro")
+    return train_val_test_split(
+        dataset["test"], _task_doc_example, train_size, validation_size
+    )
+
+
+class TaskDoc(t.TypedDict):
+    question_id: int
+    question: str
+    options: list[str]
+    answer: str
+    answer_index: int
+    cot_content: str
+    category: str
+    src: str
+
+
+inputs = ["question", "options"]
+outputs = ["answer"]
+
+
+def _num_letter(n: int) -> str:
+    return chr(ord("A") + n)
+
+
+def _task_doc_example(doc: TaskDoc) -> dspy.Example:
+    question = doc["question"]
+    options = [f"{_num_letter(i)}. {option}" for i, option in enumerate(doc["options"])]
+    answer = doc["answer"]
+    return dspy.Example(
+        question=question,
+        options=options,
+        answer=answer,
+    ).with_inputs(*inputs)

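Since the commit message mentions MIPRO, here is a hedged sketch of one way `signature`, `metric`, and `datasets` could be wired into a dspy program and optimizer. The model name and optimizer settings are placeholders, and the `dspy.MIPROv2` / `dspy.Evaluate` usage assumes a recent dspy release rather than anything in this commit:

```python
# Hedged sketch (not part of the commit) of optimizing this benchmark with MIPROv2.
import dspy

from benchmarks import leaderboard_mmlu_pro as task

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # placeholder model

splits = task.datasets()
program = dspy.ChainOfThought(task.signature("Answer with the letter of the correct option."))

# Assumes a recent dspy release where MIPROv2 and Evaluate are exposed at top level.
optimizer = dspy.MIPROv2(metric=task.metric, auto="light")
optimized = optimizer.compile(program, trainset=splits.trainset, valset=splits.valset)

evaluate = dspy.Evaluate(devset=splits.testset, metric=task.metric, num_threads=8)
print(evaluate(optimized))
```
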
+ 48 - 0
end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu_pro.py

@@ -0,0 +1,48 @@
+import typing as t
+
+from datasets import load_dataset
+import dspy
+
+from .datatypes import TaskDatasets
+from .helpers import train_val_test_split
+
+
+def datasets(
+    train_size: float = 0.1,
+    validation_size: float = 0.2,
+) -> TaskDatasets:
+    """
+    TODO:
+    Load dataset, dataset should be datasets.Dataset type (NOT DatasetDict, OR split the dataset yourself how you want)
+    """
+    dataset = load_dataset("TODO")
+    return train_val_test_split(dataset, _task_doc_example, train_size, validation_size)
+
+
+class TaskDoc(t.TypedDict):
+    problem: str
+    gold: str
+
+
+inputs = ["problem"]
+outputs = ["answer"]
+
+
+def _task_doc_example(doc: TaskDoc) -> dspy.Example:
+    return dspy.Example(
+        problem=doc["problem"],
+        answer=doc["gold"],
+    ).with_inputs(*inputs)
+
+
+def signature(instructions: str = "") -> dspy.Signature:
+    class MMLUPro(dspy.Signature):
+        __doc__ = instructions
+        problem: str = dspy.InputField()
+        answer: str = dspy.OutputField()
+
+    return MMLUPro
+
+
+def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool:
+    return gold.answer == pred.answer

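The `load_dataset("TODO")` call is intentionally left unfilled. A hypothetical way to complete it is sketched below; the dataset id and split name are placeholders, and the `problem`/`gold` columns simply mirror the `TaskDoc` above:

```python
# Hypothetical completion of the TODO above; the dataset id and split name
# are placeholders, not confirmed by this commit.
from datasets import load_dataset

from benchmarks.datatypes import TaskDatasets
from benchmarks.helpers import train_val_test_split
from benchmarks.llama_mmlu_pro import _task_doc_example


def datasets(train_size: float = 0.1, validation_size: float = 0.2) -> TaskDatasets:
    # load_dataset usually returns a DatasetDict, so select a single split to
    # hand train_val_test_split the plain Dataset it expects.
    dataset = load_dataset("my-org/llama-mmlu-pro-evals", split="test")  # placeholder
    return train_val_test_split(dataset, _task_doc_example, train_size, validation_size)
```
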
+ 15 - 6
end-to-end-use-cases/prompt-migration/environment.yml

@@ -2,12 +2,21 @@ name: prompt-migration
channels:
  - defaults
  - pytorch
+  - conda-forge
dependencies:
-  - python=3.9
+  - python=3.10  # Updated to match pyproject.toml requires-python
  - pip
+  - numpy<2  # Matches pyproject.toml dependency
  - pip:
-    - dspy-ai
-    - torch
-    - transformers
-    - openai
-    - databricks-sdk 
+      - dspy @ git+ssh://git@github.com/stanfordnlp/dspy.git
+      - lm-eval[wandb,api,math,ifeval,sentencepiece]>=0.4.7
+      - python-dotenv>=1.0.1
+      - ipdb>=0.13.13
+      - ipython>=8.31.0
+      - pytest>=8.3.4
+      - ruff>=0.9.1
+      - ipykernel>=6.29.5
+      - torch
+      - transformers
+      - openai
+      - databricks-sdk

+ 0 - 36
end-to-end-use-cases/prompt-migration/examples/usage.py

@@ -1,36 +0,0 @@
-import dspy
-from prompt_migration.engine import PromptMigrationEngine, PromptTemplate
-from prompt_migration.evaluator import PromptEvaluator
-
-# Initialize LMs
-openai_lm = dspy.OpenAI(model="gpt-3.5-turbo")
-target_lm = dspy.HFModel(model="gpt2")
-
-# Create migration engine
-engine = PromptMigrationEngine(openai_lm, target_lm)
-
-# Define source prompt
-source_prompt = PromptTemplate(
-    template="Summarize the following text: {text}",
-    input_variables=["text"],
-    model_type="openai"
-)
-
-eval_dataset = [
-    {"text": "Example text 1", "expected_answer": "Summary 1"},
-    {"text": "Example text 2", "expected_answer": "Summary 2"},
-]
-
-# Migrate prompt
-migrated_prompt = engine.migrate_prompt(source_prompt, eval_dataset)
-
-# Evaluate migration
-evaluator = PromptEvaluator(openai_lm, target_lm)
-metrics = evaluator.evaluate(
-    source_prompt.template,
-    migrated_prompt.template,
-    eval_dataset
-)
-
-print(f"Migrated prompt: {migrated_prompt.template}")
-print(f"Evaluation metrics: {metrics}") 

Diff too large to display
+ 980 - 0
end-to-end-use-cases/prompt-migration/harness.ipynb


+ 0 - 103
end-to-end-use-cases/prompt-migration/main.py

@@ -1,103 +0,0 @@
-import dspy
-from prompt_migration.engine import PromptMigrationEngine, PromptTemplate
-from prompt_migration.evaluator import PromptEvaluator
-from prompt_migration.eval_dataset import get_evaluation_dataset, get_eval_subset
-
-import os
-import dotenv
-
-dotenv.load_dotenv()
-
-def main():
-    openai_lm = dspy.LM(
-        model="gpt-3.5-turbo",
-        api_key=os.getenv("OPENAI_API_KEY")
-    )
-    
-    target_lm = dspy.LM(
-        model="together_ai/meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
-        api_key=os.getenv("TOGETHER_API_KEY")
-    )
-    # To run it with ollama
-    # target_lm = dspy.LM('ollama_chat/llama3.2:3b-instruct-fp16', api_base='http://localhost:11434', api_key='')
-
-    # To run it with huggingface
-    # target_lm = dspy.HFModel(model="gpt2")
-    
-    engine = PromptMigrationEngine(openai_lm, target_lm)
-    
-    source_prompt = PromptTemplate(
-        template="""You are an advanced Large Language Model tasked with generating Python code snippets in response to user prompts. Your primary objective is to provide accurate, concise, and well-structured Python functions. Follow these guidelines:
-
-    Understand the Context: Analyze the input prompt and identify its category (e.g., API Usage, File Handling, Error Handling).
-
-    Generate Code:
-        Write Python code that directly addresses the user's request.
-        Ensure the code is syntactically correct, functional, and adheres to Python best practices.
-        Include necessary imports and handle potential edge cases.
-
-    Error Handling:
-        Include appropriate error handling where applicable (e.g., try-except blocks).
-        If exceptions occur, provide meaningful error messages.
-
-    Readability:
-        Use clear variable names and include comments where necessary for clarity.
-        Prioritize readability and maintainability in all generated code.
-
-    Complexity Alignment:
-        Tailor the code's complexity based on the indicated difficulty (e.g., simple, medium, complex).
-        Ensure that the solution is neither overly simplistic nor unnecessarily complicated.
-
-    Prompt Type:
-        Focus on the code_generation type for creating Python functions.
-        Avoid deviating from the task unless additional clarification is requested.
-
-    Testing and Validity:
-        Assume the function might be run immediately. Provide code that is ready for use or minimal adaptation.
-        Highlight any dependencies or external libraries required.
-        """,
-        input_variables=["text"],
-        model_type="openai"
-    )
-    
-    eval_dataset = get_evaluation_dataset()
-
-
-    # To evaluate on a specific subset, use the following:
-    code_generation_dataset = get_eval_subset(prompt_type="code_generation")
-    #simple_tasks = get_eval_subset(complexity="simple")
-    evaluator = PromptEvaluator(openai_lm, target_lm)
-
-    metrics = evaluator.evaluate(
-        source_prompt.template,  # Same prompt for both
-        source_prompt.template,  # Same prompt for both
-        code_generation_dataset
-    )
-    
-    print(f"Evaluation metrics:")
-    print(f"  Accuracy: {metrics.accuracy:.2f}")
-    print(f"  Similarity: {metrics.similarity:.2f}")
-    print(f"  Consistency: {metrics.consistency:.2f}")
-    
-    # Migrate prompt
-    print("Migrating prompt...")
-    migrated_prompt = engine.migrate_prompt(source_prompt, code_generation_dataset)
-    
-    # Evaluate migration
-    print("Evaluating migration...")
-    metrics = evaluator.evaluate(
-        source_prompt.template,
-        migrated_prompt.template,
-        code_generation_dataset
-    )
-    
-    print(f"\nResults:")
-    print(f"Original prompt: {source_prompt.template}")
-    print(f"Migrated prompt: {migrated_prompt.template}")
-    print(f"Evaluation metrics:")
-    print(f"  Accuracy: {metrics.accuracy:.2f}")
-    print(f"  Similarity: {metrics.similarity:.2f}")
-    print(f"  Consistency: {metrics.consistency:.2f}")
-
-if __name__ == "__main__":
-    main() 

Diff too large to display
+ 851 - 0
end-to-end-use-cases/prompt-migration/notebooks/harness.ipynb


+ 0 - 106
end-to-end-use-cases/prompt-migration/prompt_migration/engine.py

@@ -1,106 +0,0 @@
-import dspy
-from typing import List, Dict, Optional
-from dataclasses import dataclass
-
-@dataclass
-class PromptTemplate:
-    template: str
-    input_variables: List[str]
-    model_type: str  # 'openai' or 'llama'
-
-class PromptMigrationEngine:
-    def __init__(self, source_lm: dspy.LM, target_lm: dspy.LM):
-        self.source_lm = source_lm
-        self.target_lm = target_lm
-        dspy.configure(lm=source_lm)
-    
-    def _optimize_transformation(self, transformer, eval_dataset):
-        """Optimize the transformation using the evaluation dataset."""
-        class PromptQualityMetric:
-            def __init__(self, source_lm, target_lm):
-                self.source_lm = source_lm
-                self.target_lm = target_lm
-            
-            def __call__(self, example, prediction, trace=None):
-                if not hasattr(prediction, 'target'):
-                    return 0.0
-                
-                try:
-                    # Get outputs from both models using the prompts
-                    source_output = self.source_lm(example.source)
-                    target_output = self.target_lm(prediction.target)
-                    
-                    # Compare outputs (basic similarity)
-                    from difflib import SequenceMatcher
-                    similarity = SequenceMatcher(None, 
-                                              str(source_output), 
-                                              str(target_output)).ratio()
-                    return similarity
-                except Exception as e:
-                    print(f"Error in metric: {e}")
-                    return 0.0
-        
-        optimizer = dspy.BootstrapFewShotWithRandomSearch(
-            metric=PromptQualityMetric(self.source_lm, self.target_lm),
-            max_bootstrapped_demos=2,
-            max_labeled_demos=2,
-            num_threads=1
-        )
-        
-        # Prepare training data
-        train_data = []
-        for item in eval_dataset:
-            # Create example with both prompt and expected output
-            example = dspy.Example(
-                source=item["text"],
-                expected_output=item["expected_answer"]
-            ).with_inputs("source")
-            train_data.append(example)
-        
-        return optimizer.compile(transformer, trainset=train_data)
-    
-    def migrate_prompt(self, 
-                      source_prompt: PromptTemplate,
-                      eval_dataset: Optional[List[Dict]] = None) -> PromptTemplate:
-        """Migrates a prompt from source LM to target LM format."""
-        
-        class PromptTransformation(dspy.Signature):
-            """Convert a prompt from one format to another."""
-            source = dspy.InputField(desc="Source prompt template")
-            target = dspy.OutputField(desc="Transformed prompt template that maintains functionality while adapting to target model format")
-        
-        class Transformer(dspy.Module):
-            def __init__(self):
-                super().__init__()
-                self.chain = dspy.ChainOfThought(PromptTransformation)
-            
-            def forward(self, source):
-                # Add context about the transformation task
-                prompt = f"""
-                Transform this prompt while:
-                1. Maintaining core functionality
-                2. Adapting to target model format
-                3. Preserving input variables
-                4. Keeping essential instructions
-                
-                Source prompt:
-                {source}
-                """
-                return self.chain(source=prompt)
-        
-        transformer = Transformer()
-        
-        if eval_dataset:
-            transformer = self._optimize_transformation(transformer, eval_dataset)
-            
-        result = transformer(source=source_prompt.template)
-        
-        # Format for target model
-        if source_prompt.model_type == "openai" and "llama" in str(self.target_lm):
-            result.target = f"### Instruction:\n{result.target}\n\n### Response:"
-        
-        return PromptTemplate(
-            template=result.target,
-            input_variables=source_prompt.input_variables,
-            model_type='llama'
-        ) 

+ 0 - 288
end-to-end-use-cases/prompt-migration/prompt_migration/eval_dataset.py

@@ -1,288 +0,0 @@
-from typing import List, Dict
-
-def get_evaluation_dataset() -> List[Dict]:
-    """
-    Returns a comprehensive evaluation dataset for testing prompt migrations.
-    Each test case includes:
-    - text: Input text
-    - expected_answer: Expected output
-    - prompt_type: Type of prompt (summarization, classification, qa, etc.)
-    - complexity: Difficulty level (simple, medium, complex)
-    """
-    return [
-        # Summarization examples
-        {
-            "text": "The quick brown fox jumps over the lazy dog.",
-            "expected_answer": "A fox jumps over a dog.",
-            "prompt_type": "summarization",
-            "complexity": "simple"
-        },
-        {
-            "text": """Machine learning is a subset of artificial intelligence that focuses on developing 
-                   systems that can learn from and make decisions based on data. It has numerous 
-                   applications in various fields including healthcare, finance, and autonomous vehicles.""",
-            "expected_answer": "Machine learning is an AI technology that enables systems to learn and make decisions from data, used in healthcare, finance, and autonomous vehicles.",
-            "prompt_type": "summarization",
-            "complexity": "medium"
-        },
-
-        # Classification examples
-        {
-            "text": "I absolutely loved this product! Best purchase ever!",
-            "expected_answer": "Positive",
-            "prompt_type": "sentiment_classification",
-            "complexity": "simple"
-        },
-        {
-            "text": "The product works fine but the customer service could be better.",
-            "expected_answer": "Neutral",
-            "prompt_type": "sentiment_classification",
-            "complexity": "medium"
-        },
-
-        # Question-Answering examples
-        {
-            "text": "What is the capital of France? Context: Paris is the capital and largest city of France.",
-            "expected_answer": "Paris",
-            "prompt_type": "qa",
-            "complexity": "simple"
-        },
-        {
-            "text": """What causes rain? Context: Rain is precipitation of liquid water in the form of droplets. 
-                   Water vapor in warm air rises and cools, forming clouds. When the droplets become too 
-                   heavy, they fall as rain.""",
-            "expected_answer": "Rain occurs when water vapor in warm air rises, cools to form clouds, and droplets become heavy enough to fall.",
-            "prompt_type": "qa",
-            "complexity": "medium"
-        },
-
-        # Code-related examples
-        {
-            "text": "Write a function to add two numbers in Python.",
-            "expected_answer": "def add(a, b):\n    return a + b",
-            "prompt_type": "code_generation",
-            "complexity": "simple"
-        },
-        {
-            "text": "Explain what this code does: for i in range(len(arr)): arr[i] *= 2",
-            "expected_answer": "This code multiplies each element in the array 'arr' by 2.",
-            "prompt_type": "code_explanation",
-            "complexity": "simple"
-        },
-
-        # Text transformation examples
-        {
-            "text": "convert this to passive voice: The cat chased the mouse.",
-            "expected_answer": "The mouse was chased by the cat.",
-            "prompt_type": "text_transformation",
-            "complexity": "simple"
-        },
-        {
-            "text": "translate to French: Hello, how are you?",
-            "expected_answer": "Bonjour, comment allez-vous?",
-            "prompt_type": "translation",
-            "complexity": "simple"
-        },
-
-        # Complex reasoning examples
-        {
-            "text": """A train leaves Station A at 2:00 PM traveling at 60 mph. Another train leaves 
-                   Station B at 3:00 PM traveling at 75 mph in the opposite direction. If the stations 
-                   are 375 miles apart, at what time will the trains meet?""",
-            "expected_answer": "The trains will meet at 5:00 PM.",
-            "prompt_type": "problem_solving",
-            "complexity": "complex"
-        },
-        {
-            "text": """Analyze the environmental impact of electric vehicles versus traditional 
-                   gasoline vehicles, considering manufacturing, operation, and disposal.""",
-            "expected_answer": """Electric vehicles typically have higher manufacturing emissions but lower 
-                              operational emissions compared to gasoline vehicles. Overall lifecycle 
-                              environmental impact depends on electricity source and battery recycling.""",
-            "prompt_type": "analysis",
-            "complexity": "complex"
-        },
-
-        # Simple Code Generation
-        {
-            "text": "Write a Python function to check if a number is prime.",
-            "expected_answer": """def is_prime(n):
-    if n < 2:
-        return False
-    for i in range(2, int(n ** 0.5) + 1):
-        if n % i == 0:
-            return False
-    return True""",
-            "prompt_type": "code_generation",
-            "complexity": "medium"
-        },
-        {
-            "text": "Create a Python function to reverse a string.",
-            "expected_answer": """def reverse_string(s):
-    return s[::-1]""",
-            "prompt_type": "code_generation",
-            "complexity": "simple"
-        },
-        
-        # Code Explanation
-        {
-            "text": "Explain what this code does: [x*x for x in range(10) if x % 2 == 0]",
-            "expected_answer": "This list comprehension creates a list of squares of even numbers from 0 to 9. It filters numbers where x modulo 2 equals 0 (even numbers) and squares them.",
-            "prompt_type": "code_explanation",
-            "complexity": "medium"
-        },
-        
-        # Algorithm Implementation
-        {
-            "text": "Write a Python function to implement binary search.",
-            "expected_answer": """def binary_search(arr, target):
-    left, right = 0, len(arr) - 1
-    
-    while left <= right:
-        mid = (left + right) // 2
-        if arr[mid] == target:
-            return mid
-        elif arr[mid] < target:
-            left = mid + 1
-        else:
-            right = mid - 1
-            
-    return -1""",
-            "prompt_type": "code_generation",
-            "complexity": "medium"
-        },
-        
-        # Data Structure Implementation
-        {
-            "text": "Implement a Stack class in Python using a list.",
-            "expected_answer": """class Stack:
-    def __init__(self):
-        self.items = []
-        
-    def push(self, item):
-        self.items.append(item)
-        
-    def pop(self):
-        if not self.is_empty():
-            return self.items.pop()
-        
-    def is_empty(self):
-        return len(self.items) == 0
-        
-    def peek(self):
-        if not self.is_empty():
-            return self.items[-1]""",
-            "prompt_type": "code_generation",
-            "complexity": "medium"
-        },
-        
-        # Code Debugging
-        {
-            "text": "Find and fix the bug in this code: def factorial(n): return n * factorial(n-1)",
-            "expected_answer": """def factorial(n):
-    if n == 0 or n == 1:
-        return 1
-    return n * factorial(n-1)""",
-            "prompt_type": "code_debugging",
-            "complexity": "medium"
-        },
-        
-        # Code Optimization
-        {
-            "text": "Optimize this code: def fibonacci(n): return fibonacci(n-1) + fibonacci(n-2) if n > 1 else n",
-            "expected_answer": """def fibonacci(n):
-    if n <= 1:
-        return n
-    a, b = 0, 1
-    for _ in range(2, n + 1):
-        a, b = b, a + b
-    return b""",
-            "prompt_type": "code_optimization",
-            "complexity": "medium"
-        },
-        
-        # API Usage
-        {
-            "text": "Write a Python function using requests to fetch data from a REST API endpoint.",
-            "expected_answer": """import requests
-
-def fetch_data(url, params=None):
-    try:
-        response = requests.get(url, params=params)
-        response.raise_for_status()
-        return response.json()
-    except requests.RequestException as e:
-        print(f"Error fetching data: {e}")
-        return None""",
-            "prompt_type": "code_generation",
-            "complexity": "medium"
-        },
-        
-        # File Handling
-        {
-            "text": "Write a Python function to read a CSV file and return it as a list of dictionaries.",
-            "expected_answer": """import csv
-
-def read_csv(file_path):
-    data = []
-    try:
-        with open(file_path, 'r') as file:
-            reader = csv.DictReader(file)
-            for row in reader:
-                data.append(row)
-        return data
-    except Exception as e:
-        print(f"Error reading CSV: {e}")
-        return None""",
-            "prompt_type": "code_generation",
-            "complexity": "medium"
-        },
-        
-        # Error Handling
-        {
-            "text": "Write a Python function that safely converts a string to integer with error handling.",
-            "expected_answer": """def safe_int_convert(s):
-    try:
-        return int(s), None
-    except ValueError as e:
-        return None, str(e)""",
-            "prompt_type": "code_generation",
-            "complexity": "simple"
-        },
-        
-        # Complex Algorithm
-        {
-            "text": "Implement a Python function for Depth-First Search on a graph.",
-            "expected_answer": """def dfs(graph, start, visited=None):
-    if visited is None:
-        visited = set()
-    
-    visited.add(start)
-    
-    for next_node in graph[start]:
-        if next_node not in visited:
-            dfs(graph, next_node, visited)
-            
-    return visited""",
-            "prompt_type": "code_generation",
-            "complexity": "complex"
-        }
-    ]
-
-def get_eval_subset(prompt_type: str = None, complexity: str = None) -> List[Dict]:
-    """
-    Returns a filtered subset of the evaluation dataset based on prompt type and/or complexity.
-    
-    Args:
-        prompt_type: Type of prompts to filter (e.g., 'summarization', 'qa', etc.)
-        complexity: Complexity level to filter (e.g., 'simple', 'medium', 'complex')
-    """
-    dataset = get_evaluation_dataset()
-    
-    if prompt_type:
-        dataset = [d for d in dataset if d["prompt_type"] == prompt_type]
-    
-    if complexity:
-        dataset = [d for d in dataset if d["complexity"] == complexity]
-    
-    return dataset 

+ 0 - 188
end-to-end-use-cases/prompt-migration/prompt_migration/evaluator.py

@@ -1,188 +0,0 @@
-import json
-from typing import List, Dict
-from dataclasses import dataclass
-import dspy
-import os
-from datetime import datetime
-
-@dataclass
-class EvaluationMetrics:
-    accuracy: float
-    similarity: float
-    consistency: float
-    individual_scores: List[Dict]  # Store individual test case scores
-
-class PromptEvaluator:
-    def __init__(self, source_lm: dspy.LM, target_lm: dspy.LM):
-        self.source_lm = source_lm
-        self.target_lm = target_lm
-        dspy.configure(lm=source_lm)  # Configure DSPy to use source_lm for judge
-        
-    def _create_judge(self):
-        """Create an LLM judge to evaluate outputs."""
-        class OutputJudge(dspy.Signature):
-            """Judge the quality and equivalence of outputs."""
-            input_text = dspy.InputField(desc="The coding task")
-            source_output = dspy.InputField(desc="Output from source prompt")
-            target_output = dspy.InputField(desc="Output from target prompt")
-            expected_output = dspy.InputField(desc="Expected output from dataset")
-            
-            equivalence = dspy.OutputField(
-                desc="Are the outputs functionally equivalent to the expected output? Answer ONLY with 'yes' or 'no'."
-            )
-            accuracy = dspy.OutputField(
-                desc="Rate how well the outputs match the expected output. Provide ONLY a number between 0 and 100, no text."
-            )
-            consistency = dspy.OutputField(
-                desc="Rate how consistent the outputs are with each other. Provide ONLY a number between 0 and 100, no text."
-            )
-            reasoning = dspy.OutputField(
-                desc="Explain your evaluation, focusing on functionality and correctness."
-            )
-
-        class Judge(dspy.Module):
-            def __init__(self):
-                super().__init__()
-                self.judge = dspy.ChainOfThought(OutputJudge)
-            
-            def forward(self, input_text, source_output, target_output, expected_output):
-                try:
-                    result = self.judge(
-                        input_text=input_text,
-                        source_output=source_output,
-                        target_output=target_output,
-                        expected_output=expected_output
-                    )
-                    
-                    # Ensure numeric scores
-                    def clean_score(score):
-                        try:
-                            # Extract just numbers
-                            import re
-                            numbers = re.findall(r'\d+', str(score))
-                            return float(numbers[0]) if numbers else 0.0
-                        except:
-                            return 0.0
-                    
-                    result.accuracy = clean_score(result.accuracy)
-                    result.consistency = clean_score(result.consistency)
-                    result.equivalence = str(result.equivalence).lower().strip()
-                    
-                    return result
-                except Exception as e:
-                    print(f"Error in judge: {str(e)}")
-                    return type('Result', (), {
-                        'accuracy': '0',
-                        'consistency': '0',
-                        'equivalence': 'no',
-                        'reasoning': f'Error in evaluation: {str(e)}'
-                    })()
-
-        return Judge()
-
-    def _get_model_output(self, prompt: str, input_text: str) -> str:
-        """Get output from target model using the provided prompt."""
-        try:
-            formatted_prompt = prompt.format(text=input_text)
-            response = self.target_lm(formatted_prompt)
-            
-            if isinstance(response, list):
-                return response[0] if response else ""
-            return str(response)
-        except Exception as e:
-            print(f"Error generating output: {str(e)}")
-            return ""
-
-    def _calculate_metrics(self, source_prompt: str, target_prompt: str, test_cases: List[Dict]) -> EvaluationMetrics:
-        """Calculate evaluation metrics using target model for both prompts."""
-        total_similarity = 0.0
-        total_accuracy = 0.0
-        total_consistency = 0.0
-        individual_scores = []
-        
-        judge = self._create_judge()
-        num_cases = len(test_cases)
-        
-        for case in test_cases:
-            input_text = case["text"]
-            expected = case["expected_answer"]
-            
-            # Get outputs from target model using both prompts
-            source_output = self._get_model_output(source_prompt, input_text)
-            target_output = self._get_model_output(target_prompt, input_text)
-            
-            judgment = judge(
-                input_text=input_text,
-                source_output=source_output,
-                target_output=target_output,
-                expected_output=expected
-            )
-            
-            accuracy_score = float(judgment.accuracy) / 100
-            consistency_score = float(judgment.consistency) / 100
-            is_equivalent = judgment.equivalence.lower() == "yes"
-            
-            case_scores = {
-                "input": input_text,
-                "expected": expected,
-                "source_output": source_output,
-                "target_output": target_output,
-                "accuracy": accuracy_score,
-                "consistency": consistency_score,
-                "equivalent": is_equivalent,
-                "reasoning": judgment.reasoning
-            }
-            individual_scores.append(case_scores)
-            
-            total_accuracy += accuracy_score
-            total_consistency += consistency_score
-            total_similarity += float(is_equivalent)
-            
-            print(f"\nEvaluation for case: {input_text[:50]}...")
-            print(f"Source output: {source_output[:100]}...")
-            print(f"Target output: {target_output[:100]}...")
-            print(f"Expected: {expected[:100]}...")
-            print(f"Judge's reasoning: {judgment.reasoning}")
-            print(f"Scores - Accuracy: {accuracy_score:.2f}, Consistency: {consistency_score:.2f}, Equivalent: {is_equivalent}")
-        
-        metrics = EvaluationMetrics(
-            accuracy=total_accuracy / num_cases,
-            similarity=total_similarity / num_cases,
-            consistency=total_consistency / num_cases,
-            individual_scores=individual_scores
-        )
-        
-        results = {
-            "source_prompt": source_prompt,
-            "target_prompt": target_prompt,
-            "aggregate_metrics": {
-                "accuracy": metrics.accuracy,
-                "similarity": metrics.similarity,
-                "consistency": metrics.consistency
-            },
-            "individual_scores": individual_scores
-        }
-        
-        self._save_results(results)
-
-        
-        return metrics
-    
-    def evaluate(self, 
-                source_prompt: str, 
-                target_prompt: str, 
-                test_cases: List[Dict]) -> EvaluationMetrics:
-        """Evaluates both prompts using the target model."""
-        return self._calculate_metrics(source_prompt, target_prompt, test_cases)
-    
-    def _save_results(self, results: dict, filename: str = 'results.json') -> None:
-        """Save results to a JSON file with a new name if the file already exists."""
-
-        if os.path.exists(filename):
-            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
-            base, ext = os.path.splitext(filename)
-            filename = f"{base}_{timestamp}{ext}"
-        
-        with open(filename, 'w') as f:
-            json.dump(results, f, indent=2)
-        print(f"Results saved to {filename}")

+ 33 - 0
end-to-end-use-cases/prompt-migration/pyproject.toml

@@ -0,0 +1,33 @@
+[project]
+name = "llama-prompt-migrator"
+version = "0.1.0"
+description = ""
+authors = [{ name = "Cyrus Nouroozi", email = "cyrus@zenbase.ai" }]
+dependencies = [
+    "dspy @ git+https://github.com/stanfordnlp/dspy.git",
+    "numpy<2",
+    "lm-eval[wandb,api,math,ifeval,sentencepiece]>=0.4.7",
+    "python-dotenv>=1.0.1",
+]
+readme = "README.md"
+requires-python = ">= 3.10"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.rye]
+managed = true
+dev-dependencies = [
+    "ipdb>=0.13.13",
+    "ipython>=8.31.0",
+    "pytest>=8.3.4",
+    "ruff>=0.9.1",
+    "ipykernel>=6.29.5",
+]
+
+[tool.hatch.metadata]
+allow-direct-references = true
+
+[tool.hatch.build.targets.wheel]
+packages = ["prompt_migrator/"]

+ 0 - 1
end-to-end-use-cases/prompt-migration/readme.md

@@ -1 +0,0 @@
-#TODO

Diff too large to display
+ 1080 - 0
end-to-end-use-cases/prompt-migration/v2/notebooks/harness.ipynb