eval_dataset.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
from typing import Dict, List, Optional
  2. def get_evaluation_dataset() -> List[Dict]:
  3. """
  4. Returns a comprehensive evaluation dataset for testing prompt migrations.
  5. Each test case includes:
  6. - text: Input text
  7. - expected_answer: Expected output
  8. - prompt_type: Type of prompt (summarization, classification, qa, etc.)
  9. - complexity: Difficulty level (simple, medium, complex)
  10. """
  11. return [
  12. # Summarization examples
  13. {
  14. "text": "The quick brown fox jumps over the lazy dog.",
  15. "expected_answer": "A fox jumps over a dog.",
  16. "prompt_type": "summarization",
  17. "complexity": "simple"
  18. },
  19. {
  20. "text": """Machine learning is a subset of artificial intelligence that focuses on developing
  21. systems that can learn from and make decisions based on data. It has numerous
  22. applications in various fields including healthcare, finance, and autonomous vehicles.""",
  23. "expected_answer": "Machine learning is an AI technology that enables systems to learn and make decisions from data, used in healthcare, finance, and autonomous vehicles.",
  24. "prompt_type": "summarization",
  25. "complexity": "medium"
  26. },
  27. # Classification examples
  28. {
  29. "text": "I absolutely loved this product! Best purchase ever!",
  30. "expected_answer": "Positive",
  31. "prompt_type": "sentiment_classification",
  32. "complexity": "simple"
  33. },
  34. {
  35. "text": "The product works fine but the customer service could be better.",
  36. "expected_answer": "Neutral",
  37. "prompt_type": "sentiment_classification",
  38. "complexity": "medium"
  39. },
  40. # Question-Answering examples
  41. {
  42. "text": "What is the capital of France? Context: Paris is the capital and largest city of France.",
  43. "expected_answer": "Paris",
  44. "prompt_type": "qa",
  45. "complexity": "simple"
  46. },
  47. {
  48. "text": """What causes rain? Context: Rain is precipitation of liquid water in the form of droplets.
  49. Water vapor in warm air rises and cools, forming clouds. When the droplets become too
  50. heavy, they fall as rain.""",
  51. "expected_answer": "Rain occurs when water vapor in warm air rises, cools to form clouds, and droplets become heavy enough to fall.",
  52. "prompt_type": "qa",
  53. "complexity": "medium"
  54. },
  55. # Code-related examples
  56. {
  57. "text": "Write a function to add two numbers in Python.",
  58. "expected_answer": "def add(a, b):\n return a + b",
  59. "prompt_type": "code_generation",
  60. "complexity": "simple"
  61. },
  62. {
  63. "text": "Explain what this code does: for i in range(len(arr)): arr[i] *= 2",
  64. "expected_answer": "This code multiplies each element in the array 'arr' by 2.",
  65. "prompt_type": "code_explanation",
  66. "complexity": "simple"
  67. },
  68. # Text transformation examples
  69. {
  70. "text": "convert this to passive voice: The cat chased the mouse.",
  71. "expected_answer": "The mouse was chased by the cat.",
  72. "prompt_type": "text_transformation",
  73. "complexity": "simple"
  74. },
  75. {
  76. "text": "translate to French: Hello, how are you?",
  77. "expected_answer": "Bonjour, comment allez-vous?",
  78. "prompt_type": "translation",
  79. "complexity": "simple"
  80. },
  81. # Complex reasoning examples
  82. {
  83. "text": """A train leaves Station A at 2:00 PM traveling at 60 mph. Another train leaves
  84. Station B at 3:00 PM traveling at 75 mph in the opposite direction. If the stations
  85. are 375 miles apart, at what time will the trains meet?""",
  86. "expected_answer": "The trains will meet at 5:00 PM.",
  87. "prompt_type": "problem_solving",
  88. "complexity": "complex"
  89. },
  90. {
  91. "text": """Analyze the environmental impact of electric vehicles versus traditional
  92. gasoline vehicles, considering manufacturing, operation, and disposal.""",
  93. "expected_answer": """Electric vehicles typically have higher manufacturing emissions but lower
  94. operational emissions compared to gasoline vehicles. Overall lifecycle
  95. environmental impact depends on electricity source and battery recycling.""",
  96. "prompt_type": "analysis",
  97. "complexity": "complex"
  98. },
  99. # Simple Code Generation
  100. {
  101. "text": "Write a Python function to check if a number is prime.",
  102. "expected_answer": """def is_prime(n):
  103. if n < 2:
  104. return False
  105. for i in range(2, int(n ** 0.5) + 1):
  106. if n % i == 0:
  107. return False
  108. return True""",
  109. "prompt_type": "code_generation",
  110. "complexity": "medium"
  111. },
  112. {
  113. "text": "Create a Python function to reverse a string.",
  114. "expected_answer": """def reverse_string(s):
  115. return s[::-1]""",
  116. "prompt_type": "code_generation",
  117. "complexity": "simple"
  118. },
  119. # Code Explanation
  120. {
  121. "text": "Explain what this code does: [x*x for x in range(10) if x % 2 == 0]",
  122. "expected_answer": "This list comprehension creates a list of squares of even numbers from 0 to 9. It filters numbers where x modulo 2 equals 0 (even numbers) and squares them.",
  123. "prompt_type": "code_explanation",
  124. "complexity": "medium"
  125. },
  126. # Algorithm Implementation
  127. {
  128. "text": "Write a Python function to implement binary search.",
  129. "expected_answer": """def binary_search(arr, target):
  130. left, right = 0, len(arr) - 1
  131. while left <= right:
  132. mid = (left + right) // 2
  133. if arr[mid] == target:
  134. return mid
  135. elif arr[mid] < target:
  136. left = mid + 1
  137. else:
  138. right = mid - 1
  139. return -1""",
  140. "prompt_type": "code_generation",
  141. "complexity": "medium"
  142. },
  143. # Data Structure Implementation
  144. {
  145. "text": "Implement a Stack class in Python using a list.",
  146. "expected_answer": """class Stack:
  147. def __init__(self):
  148. self.items = []
  149. def push(self, item):
  150. self.items.append(item)
  151. def pop(self):
  152. if not self.is_empty():
  153. return self.items.pop()
  154. def is_empty(self):
  155. return len(self.items) == 0
  156. def peek(self):
  157. if not self.is_empty():
  158. return self.items[-1]""",
  159. "prompt_type": "code_generation",
  160. "complexity": "medium"
  161. },
  162. # Code Debugging
  163. {
  164. "text": "Find and fix the bug in this code: def factorial(n): return n * factorial(n-1)",
  165. "expected_answer": """def factorial(n):
  166. if n == 0 or n == 1:
  167. return 1
  168. return n * factorial(n-1)""",
  169. "prompt_type": "code_debugging",
  170. "complexity": "medium"
  171. },
  172. # Code Optimization
  173. {
  174. "text": "Optimize this code: def fibonacci(n): return fibonacci(n-1) + fibonacci(n-2) if n > 1 else n",
  175. "expected_answer": """def fibonacci(n):
  176. if n <= 1:
  177. return n
  178. a, b = 0, 1
  179. for _ in range(2, n + 1):
  180. a, b = b, a + b
  181. return b""",
  182. "prompt_type": "code_optimization",
  183. "complexity": "medium"
  184. },
  185. # API Usage
  186. {
  187. "text": "Write a Python function using requests to fetch data from a REST API endpoint.",
  188. "expected_answer": """import requests
  189. def fetch_data(url, params=None):
  190. try:
  191. response = requests.get(url, params=params)
  192. response.raise_for_status()
  193. return response.json()
  194. except requests.RequestException as e:
  195. print(f"Error fetching data: {e}")
  196. return None""",
  197. "prompt_type": "code_generation",
  198. "complexity": "medium"
  199. },
  200. # File Handling
  201. {
  202. "text": "Write a Python function to read a CSV file and return it as a list of dictionaries.",
  203. "expected_answer": """import csv
  204. def read_csv(file_path):
  205. data = []
  206. try:
  207. with open(file_path, 'r') as file:
  208. reader = csv.DictReader(file)
  209. for row in reader:
  210. data.append(row)
  211. return data
  212. except Exception as e:
  213. print(f"Error reading CSV: {e}")
  214. return None""",
  215. "prompt_type": "code_generation",
  216. "complexity": "medium"
  217. },
  218. # Error Handling
  219. {
  220. "text": "Write a Python function that safely converts a string to integer with error handling.",
  221. "expected_answer": """def safe_int_convert(s):
  222. try:
  223. return int(s), None
  224. except ValueError as e:
  225. return None, str(e)""",
  226. "prompt_type": "code_generation",
  227. "complexity": "simple"
  228. },
  229. # Complex Algorithm
  230. {
  231. "text": "Implement a Python function for Depth-First Search on a graph.",
  232. "expected_answer": """def dfs(graph, start, visited=None):
  233. if visited is None:
  234. visited = set()
  235. visited.add(start)
  236. for next_node in graph[start]:
  237. if next_node not in visited:
  238. dfs(graph, next_node, visited)
  239. return visited""",
  240. "prompt_type": "code_generation",
  241. "complexity": "complex"
  242. }
  243. ]
  244. def get_eval_subset(prompt_type: str = None, complexity: str = None) -> List[Dict]:
  245. """
  246. Returns a filtered subset of the evaluation dataset based on prompt type and/or complexity.
  247. Args:
  248. prompt_type: Type of prompts to filter (e.g., 'summarization', 'qa', etc.)
  249. complexity: Complexity level to filter (e.g., 'simple', 'medium', 'complex')
  250. """
  251. dataset = get_evaluation_dataset()
  252. if prompt_type:
  253. dataset = [d for d in dataset if d["prompt_type"] == prompt_type]
  254. if complexity:
  255. dataset = [d for d in dataset if d["complexity"] == complexity]
  256. return dataset