@@ -0,0 +1,90 @@
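+"""LLM-as-judge evaluation of prompt migrations between a source and a target model."""
+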
+import dspy
+from typing import List, Dict
+from dataclasses import dataclass
+
+
+@dataclass
+class EvaluationMetrics:
+    """Aggregate scores from comparing source and target model outputs."""
+
+    accuracy: float
+    similarity: float
+    consistency: float
+
+
+class PromptEvaluator:
+    """Compare a source and a target LM on the same test cases using an LLM judge."""
+
+    def __init__(self, source_lm: dspy.LM, target_lm: dspy.LM):
+        self.source_lm = source_lm
+        self.target_lm = target_lm
+
+    def _create_judge(self):
+        """Create an LLM judge to evaluate prompt outputs."""
+
+        class FactJudge(dspy.Signature):
+            """Judge if the migrated prompt produces equivalent outputs."""
+
+            source_output = dspy.InputField(desc="Output from source model")
+            target_output = dspy.InputField(desc="Output from target model")
+            factually_correct = dspy.OutputField(
+                desc="Is the target output equivalent to the source output in terms of content and intent?",
+                prefix="Factual[Yes/No]:",
+            )
+            reasoning = dspy.OutputField(desc="Explanation for the judgment")
+
+        return dspy.ChainOfThought(FactJudge)
+
+    def _get_model_output(self, model, text: str) -> str:
+        """Helper function to get output from different model types."""
+        try:
+            # Try different methods since DSPy model interfaces can vary.
+            if callable(model):
+                result = model(text)
+            elif hasattr(model, 'generate'):
+                result = model.generate(text)
+            elif hasattr(model, 'complete'):
+                result = model.complete(text)
+            else:
+                raise AttributeError(f"Model {type(model)} has no supported generation method")
+            # DSPy LMs typically return a list of completions; unwrap to a single string.
+            if isinstance(result, list):
+                return result[0] if result else ""
+            return str(result)
+        except Exception as e:
+            print(f"Error generating output with {type(model)}: {e}")
+            return ""
+
+    def _calculate_metrics(self, test_cases):
+        """Calculate evaluation metrics using the LLM judge."""
+        total_similarity = 0.0
+        total_accuracy = 0.0
+        total_consistency = 0.0
+
+        judge = self._create_judge()
+
+        for case in test_cases:
+            source_output = self._get_model_output(self.source_lm, case["text"])
+            target_output = self._get_model_output(self.target_lm, case["text"])
+
+            judgment = judge(
+                source_output=source_output,
+                target_output=target_output
+            )
+
+            # The judge answers with a Yes/No prefix; treat anything starting with "yes" as equivalent.
+            is_equivalent = judgment.factually_correct.strip().lower().startswith("yes")
+
+            # Similarity and consistency both reflect the judge's equivalence verdict;
+            # accuracy is a strict exact match against the expected summary.
+            similarity = float(is_equivalent)
+            accuracy = float(target_output.strip().lower() == case["expected_summary"].strip().lower())
+            consistency = float(is_equivalent)
+
+            total_similarity += similarity
+            total_accuracy += accuracy
+            total_consistency += consistency
+
+            print(f"\nJudge's reasoning: {judgment.reasoning}")
+
+        n = len(test_cases)
+        return EvaluationMetrics(
+            accuracy=total_accuracy / n,
+            similarity=total_similarity / n,
+            consistency=total_consistency / n
+        )
+
+    def evaluate(self,
+                 source_prompt: str,
+                 target_prompt: str,
+                 test_cases: List[Dict]) -> EvaluationMetrics:
+        """Evaluate the quality of a prompt migration using the LLM judge.
+
+        Each test case is a dict with a "text" input and an "expected_summary"
+        reference output.
+        """
+
+        metrics = self._calculate_metrics(test_cases)
+
+        return metrics
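+
+
+# Example usage (an illustrative sketch; the model identifiers below are placeholders
+# and assume API credentials are configured for dspy.LM):
+#
+#   source_lm = dspy.LM("openai/gpt-4o-mini")
+#   target_lm = dspy.LM("anthropic/claude-3-5-sonnet-20241022")
+#   evaluator = PromptEvaluator(source_lm, target_lm)
+#   metrics = evaluator.evaluate(
+#       source_prompt="Summarize the following text:",
+#       target_prompt="Provide a concise summary of the text below:",
+#       test_cases=[{"text": "...", "expected_summary": "..."}],
+#   )
+#   print(metrics)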