# Evaluation dataset for testing prompt migrations.
from typing import Dict, List, Optional
def get_evaluation_dataset() -> List[Dict]:
    """Return the evaluation dataset used for testing prompt migrations.

    Every test case is a dict with exactly these string-valued keys:

    - ``text``: input text given to the prompt
    - ``expected_answer``: reference output
    - ``prompt_type``: kind of task (``summarization``, ``qa``,
      ``code_generation``, ...)
    - ``complexity``: difficulty level (``simple``, ``medium``, ``complex``)

    Returns:
        A freshly built list of test-case dicts; callers may mutate the
        result without affecting later calls.
    """
    return [
        # Summarization examples
        {
            "text": "The quick brown fox jumps over the lazy dog.",
            "expected_answer": "A fox jumps over a dog.",
            "prompt_type": "summarization",
            "complexity": "simple",
        },
        {
            "text": (
                "Machine learning is a subset of artificial intelligence that focuses on developing\n"
                "systems that can learn from and make decisions based on data. It has numerous\n"
                "applications in various fields including healthcare, finance, and autonomous vehicles."
            ),
            "expected_answer": "Machine learning is an AI technology that enables systems to learn and make decisions from data, used in healthcare, finance, and autonomous vehicles.",
            "prompt_type": "summarization",
            "complexity": "medium",
        },
        # Classification examples
        {
            "text": "I absolutely loved this product! Best purchase ever!",
            "expected_answer": "Positive",
            "prompt_type": "sentiment_classification",
            "complexity": "simple",
        },
        {
            "text": "The product works fine but the customer service could be better.",
            "expected_answer": "Neutral",
            "prompt_type": "sentiment_classification",
            "complexity": "medium",
        },
        # Question-Answering examples
        {
            "text": "What is the capital of France? Context: Paris is the capital and largest city of France.",
            "expected_answer": "Paris",
            "prompt_type": "qa",
            "complexity": "simple",
        },
        {
            "text": (
                "What causes rain? Context: Rain is precipitation of liquid water in the form of droplets.\n"
                "Water vapor in warm air rises and cools, forming clouds. When the droplets become too\n"
                "heavy, they fall as rain."
            ),
            "expected_answer": "Rain occurs when water vapor in warm air rises, cools to form clouds, and droplets become heavy enough to fall.",
            "prompt_type": "qa",
            "complexity": "medium",
        },
        # Code-related examples
        {
            "text": "Write a function to add two numbers in Python.",
            "expected_answer": "def add(a, b):\n    return a + b",
            "prompt_type": "code_generation",
            "complexity": "simple",
        },
        {
            "text": "Explain what this code does: for i in range(len(arr)): arr[i] *= 2",
            "expected_answer": "This code multiplies each element in the array 'arr' by 2.",
            "prompt_type": "code_explanation",
            "complexity": "simple",
        },
        # Text transformation examples
        {
            "text": "convert this to passive voice: The cat chased the mouse.",
            "expected_answer": "The mouse was chased by the cat.",
            "prompt_type": "text_transformation",
            "complexity": "simple",
        },
        {
            "text": "translate to French: Hello, how are you?",
            "expected_answer": "Bonjour, comment allez-vous?",
            "prompt_type": "translation",
            "complexity": "simple",
        },
        # Complex reasoning examples
        {
            "text": (
                "A train leaves Station A at 2:00 PM traveling at 60 mph. Another train leaves\n"
                "Station B at 3:00 PM traveling at 75 mph in the opposite direction. If the stations\n"
                "are 375 miles apart, at what time will the trains meet?"
            ),
            # By 3:00 PM train A has covered 60 miles, leaving 315 miles to
            # close at 60 + 75 = 135 mph: 315 / 135 h = 2 h 20 min after 3:00 PM.
            "expected_answer": "The trains will meet at 5:20 PM.",
            "prompt_type": "problem_solving",
            "complexity": "complex",
        },
        {
            "text": (
                "Analyze the environmental impact of electric vehicles versus traditional\n"
                "gasoline vehicles, considering manufacturing, operation, and disposal."
            ),
            "expected_answer": (
                "Electric vehicles typically have higher manufacturing emissions but lower\n"
                "operational emissions compared to gasoline vehicles. Overall lifecycle\n"
                "environmental impact depends on electricity source and battery recycling."
            ),
            "prompt_type": "analysis",
            "complexity": "complex",
        },
        # Simple Code Generation
        {
            "text": "Write a Python function to check if a number is prime.",
            "expected_answer": (
                "def is_prime(n):\n"
                "    if n < 2:\n"
                "        return False\n"
                "    for i in range(2, int(n ** 0.5) + 1):\n"
                "        if n % i == 0:\n"
                "            return False\n"
                "    return True"
            ),
            "prompt_type": "code_generation",
            "complexity": "medium",
        },
        {
            "text": "Create a Python function to reverse a string.",
            "expected_answer": (
                "def reverse_string(s):\n"
                "    return s[::-1]"
            ),
            "prompt_type": "code_generation",
            "complexity": "simple",
        },
        # Code Explanation
        {
            "text": "Explain what this code does: [x*x for x in range(10) if x % 2 == 0]",
            "expected_answer": "This list comprehension creates a list of squares of even numbers from 0 to 9. It filters numbers where x modulo 2 equals 0 (even numbers) and squares them.",
            "prompt_type": "code_explanation",
            "complexity": "medium",
        },
        # Algorithm Implementation
        {
            "text": "Write a Python function to implement binary search.",
            "expected_answer": (
                "def binary_search(arr, target):\n"
                "    left, right = 0, len(arr) - 1\n"
                "\n"
                "    while left <= right:\n"
                "        mid = (left + right) // 2\n"
                "        if arr[mid] == target:\n"
                "            return mid\n"
                "        elif arr[mid] < target:\n"
                "            left = mid + 1\n"
                "        else:\n"
                "            right = mid - 1\n"
                "\n"
                "    return -1"
            ),
            "prompt_type": "code_generation",
            "complexity": "medium",
        },
        # Data Structure Implementation
        {
            "text": "Implement a Stack class in Python using a list.",
            "expected_answer": (
                "class Stack:\n"
                "    def __init__(self):\n"
                "        self.items = []\n"
                "\n"
                "    def push(self, item):\n"
                "        self.items.append(item)\n"
                "\n"
                "    def pop(self):\n"
                "        if not self.is_empty():\n"
                "            return self.items.pop()\n"
                "\n"
                "    def is_empty(self):\n"
                "        return len(self.items) == 0\n"
                "\n"
                "    def peek(self):\n"
                "        if not self.is_empty():\n"
                "            return self.items[-1]"
            ),
            "prompt_type": "code_generation",
            "complexity": "medium",
        },
        # Code Debugging
        {
            "text": "Find and fix the bug in this code: def factorial(n): return n * factorial(n-1)",
            "expected_answer": (
                "def factorial(n):\n"
                "    if n == 0 or n == 1:\n"
                "        return 1\n"
                "    return n * factorial(n-1)"
            ),
            "prompt_type": "code_debugging",
            "complexity": "medium",
        },
        # Code Optimization
        {
            "text": "Optimize this code: def fibonacci(n): return fibonacci(n-1) + fibonacci(n-2) if n > 1 else n",
            "expected_answer": (
                "def fibonacci(n):\n"
                "    if n <= 1:\n"
                "        return n\n"
                "    a, b = 0, 1\n"
                "    for _ in range(2, n + 1):\n"
                "        a, b = b, a + b\n"
                "    return b"
            ),
            "prompt_type": "code_optimization",
            "complexity": "medium",
        },
        # API Usage
        {
            "text": "Write a Python function using requests to fetch data from a REST API endpoint.",
            "expected_answer": (
                "import requests\n"
                "def fetch_data(url, params=None):\n"
                "    try:\n"
                "        response = requests.get(url, params=params)\n"
                "        response.raise_for_status()\n"
                "        return response.json()\n"
                "    except requests.RequestException as e:\n"
                '        print(f"Error fetching data: {e}")\n'
                "        return None"
            ),
            "prompt_type": "code_generation",
            "complexity": "medium",
        },
        # File Handling
        {
            "text": "Write a Python function to read a CSV file and return it as a list of dictionaries.",
            "expected_answer": (
                "import csv\n"
                "def read_csv(file_path):\n"
                "    data = []\n"
                "    try:\n"
                "        with open(file_path, 'r') as file:\n"
                "            reader = csv.DictReader(file)\n"
                "            for row in reader:\n"
                "                data.append(row)\n"
                "        return data\n"
                "    except Exception as e:\n"
                '        print(f"Error reading CSV: {e}")\n'
                "        return None"
            ),
            "prompt_type": "code_generation",
            "complexity": "medium",
        },
        # Error Handling
        {
            "text": "Write a Python function that safely converts a string to integer with error handling.",
            "expected_answer": (
                "def safe_int_convert(s):\n"
                "    try:\n"
                "        return int(s), None\n"
                "    except ValueError as e:\n"
                "        return None, str(e)"
            ),
            "prompt_type": "code_generation",
            "complexity": "simple",
        },
        # Complex Algorithm
        {
            "text": "Implement a Python function for Depth-First Search on a graph.",
            "expected_answer": (
                "def dfs(graph, start, visited=None):\n"
                "    if visited is None:\n"
                "        visited = set()\n"
                "\n"
                "    visited.add(start)\n"
                "\n"
                "    for next_node in graph[start]:\n"
                "        if next_node not in visited:\n"
                "            dfs(graph, next_node, visited)\n"
                "\n"
                "    return visited"
            ),
            "prompt_type": "code_generation",
            "complexity": "complex",
        },
    ]
def get_eval_subset(
    prompt_type: Optional[str] = None,
    complexity: Optional[str] = None,
) -> List[Dict]:
    """Return the evaluation cases that match the given filters.

    Args:
        prompt_type: Keep only cases whose ``"prompt_type"`` equals this
            value (e.g. ``'summarization'``, ``'qa'``). A falsy value
            (``None`` or ``""``) disables this filter.
        complexity: Keep only cases whose ``"complexity"`` equals this
            value (e.g. ``'simple'``, ``'medium'``, ``'complex'``). A falsy
            value disables this filter.

    Returns:
        The matching test-case dicts in dataset order. When both filters
        are given a case must satisfy both; with no filters the whole
        dataset is returned.
    """
    # Single pass over the dataset; an unset (falsy) filter matches every case.
    return [
        case
        for case in get_evaluation_dataset()
        if (not prompt_type or case["prompt_type"] == prompt_type)
        and (not complexity or case["complexity"] == complexity)
    ]
|