123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123 |
from typing import Dict, List, Optional
def get_evaluation_dataset() -> List[Dict]:
    """
    Return a comprehensive evaluation dataset for testing prompt migrations.

    Each test case is a dict with:
        text: Input text fed to the prompt.
        expected_summary: Expected output for the case.
        prompt_type: Type of prompt (summarization, classification, qa, etc.).
        complexity: Difficulty level ("simple", "medium", or "complex").
    """
    # Long values use implicit string concatenation (adjacent literals) so the
    # stored data contains no embedded newlines or indentation whitespace,
    # which would otherwise pollute exact-match comparisons during evaluation.
    # The code_generation case keeps its "\n" on purpose: it is part of the
    # expected code snippet itself.
    return [
        # Summarization examples
        {
            "text": "The quick brown fox jumps over the lazy dog.",
            "expected_summary": "A fox jumps over a dog.",
            "prompt_type": "summarization",
            "complexity": "simple",
        },
        {
            "text": (
                "Machine learning is a subset of artificial intelligence that "
                "focuses on developing systems that can learn from and make "
                "decisions based on data. It has numerous applications in "
                "various fields including healthcare, finance, and autonomous "
                "vehicles."
            ),
            "expected_summary": (
                "Machine learning is an AI technology that enables systems to "
                "learn and make decisions from data, used in healthcare, "
                "finance, and autonomous vehicles."
            ),
            "prompt_type": "summarization",
            "complexity": "medium",
        },
        # Classification examples
        {
            "text": "I absolutely loved this product! Best purchase ever!",
            "expected_summary": "Positive",
            "prompt_type": "sentiment_classification",
            "complexity": "simple",
        },
        {
            "text": "The product works fine but the customer service could be better.",
            "expected_summary": "Neutral",
            "prompt_type": "sentiment_classification",
            "complexity": "medium",
        },
        # Question-Answering examples
        {
            "text": "What is the capital of France? Context: Paris is the capital and largest city of France.",
            "expected_summary": "Paris",
            "prompt_type": "qa",
            "complexity": "simple",
        },
        {
            "text": (
                "What causes rain? Context: Rain is precipitation of liquid "
                "water in the form of droplets. Water vapor in warm air rises "
                "and cools, forming clouds. When the droplets become too "
                "heavy, they fall as rain."
            ),
            "expected_summary": (
                "Rain occurs when water vapor in warm air rises, cools to "
                "form clouds, and droplets become heavy enough to fall."
            ),
            "prompt_type": "qa",
            "complexity": "medium",
        },
        # Code-related examples
        {
            "text": "Write a function to add two numbers in Python.",
            "expected_summary": "def add(a, b):\n return a + b",
            "prompt_type": "code_generation",
            "complexity": "simple",
        },
        {
            "text": "Explain what this code does: for i in range(len(arr)): arr[i] *= 2",
            "expected_summary": "This code multiplies each element in the array 'arr' by 2.",
            "prompt_type": "code_explanation",
            "complexity": "simple",
        },
        # Text transformation examples
        {
            "text": "convert this to passive voice: The cat chased the mouse.",
            "expected_summary": "The mouse was chased by the cat.",
            "prompt_type": "text_transformation",
            "complexity": "simple",
        },
        {
            "text": "translate to French: Hello, how are you?",
            "expected_summary": "Bonjour, comment allez-vous?",
            "prompt_type": "translation",
            "complexity": "simple",
        },
        # Complex reasoning examples
        {
            "text": (
                "A train leaves Station A at 2:00 PM traveling at 60 mph. "
                "Another train leaves Station B at 3:00 PM traveling at 75 "
                "mph in the opposite direction. If the stations are 375 miles "
                "apart, at what time will the trains meet?"
            ),
            "expected_summary": "The trains will meet at 5:00 PM.",
            "prompt_type": "problem_solving",
            "complexity": "complex",
        },
        {
            "text": (
                "Analyze the environmental impact of electric vehicles versus "
                "traditional gasoline vehicles, considering manufacturing, "
                "operation, and disposal."
            ),
            "expected_summary": (
                "Electric vehicles typically have higher manufacturing "
                "emissions but lower operational emissions compared to "
                "gasoline vehicles. Overall lifecycle environmental impact "
                "depends on electricity source and battery recycling."
            ),
            "prompt_type": "analysis",
            "complexity": "complex",
        },
    ]
def get_eval_subset(
    prompt_type: Optional[str] = None,
    complexity: Optional[str] = None,
    dataset: Optional[List[Dict]] = None,
) -> List[Dict]:
    """
    Return a filtered subset of the evaluation dataset.

    Args:
        prompt_type: If given, keep only cases with this prompt type
            (e.g. 'summarization', 'qa').
        complexity: If given, keep only cases with this complexity level
            ('simple', 'medium', or 'complex').
        dataset: Optional list of test cases to filter. Defaults to the
            full dataset from get_evaluation_dataset().

    Returns:
        A new list containing the matching test-case dicts; the input
        dataset is never mutated.
    """
    if dataset is None:
        dataset = get_evaluation_dataset()
    # Single pass over the data. A falsy filter value (None or "") means
    # "no filtering on that field", preserving the original truthiness
    # behavior of the separate if-blocks.
    return [
        case
        for case in dataset
        if (not prompt_type or case["prompt_type"] == prompt_type)
        and (not complexity or case["complexity"] == complexity)
    ]
|