add_cot_vllm.py 8.9 KB

# VLLM_WORKER_MULTIPROC_METHOD=spawn python scripts/add_cot_vllm.py --model_id meta-llama/Llama-3.3-70B-Instruct --dataset-path datasets/2_ready_for_CoT/func-calling-multi-turn-final/ --config configs/config.yaml --output-path "datasets/3_CoT_added/func-calling-multi-turn-final/" --batch-size 96 --max-seq-len 16000
import argparse
import json
import os
import random
import re
from typing import Any, Dict, List, Optional

import torch
import yaml
from datasets import Dataset, load_from_disk
from tqdm import tqdm
from transformers import AutoProcessor
from vllm import LLM, SamplingParams

class LLM_Singleton:
    """Process-wide singleton wrapping a vLLM engine and its chat-template processor."""

    _instance = None

    def __new__(
        cls,
        model_id,
        max_model_len=64000,
        max_num_seqs=16,
        enforce_eager=True,
        debug=False,
    ):
        if cls._instance is None:
            cls._instance = super(LLM_Singleton, cls).__new__(cls)
            cls._instance._initialize(
                model_id,
                tensor_parallel_size=torch.cuda.device_count(),
                max_model_len=max_model_len,
                max_num_seqs=max_num_seqs,
                enforce_eager=enforce_eager,
                debug=debug,
            )
        return cls._instance

    def _initialize(
        self,
        model_id,
        tensor_parallel_size=1,
        max_model_len=64000,
        max_num_seqs=16,
        enforce_eager=True,
        debug=False,
    ):
        if debug:
            print(
                f"Initializing LLM with params: {model_id}, {tensor_parallel_size}, {max_model_len}"
            )
        self.llm = LLM(
            model_id,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
            max_num_seqs=max_num_seqs,
            enforce_eager=enforce_eager,
            gpu_memory_utilization=0.95,
        )
        self.processor = AutoProcessor.from_pretrained(model_id)

def load_system_prompt(yaml_path: str) -> str:
    """Load system prompt from YAML config."""
    with open(yaml_path, "r") as f:
        config = yaml.safe_load(f)
    return config["system_prompt"]
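
# The config file only needs to expose a "system_prompt" key (see
# load_system_prompt above). A minimal configs/config.yaml might look like the
# sketch below; the prompt text itself is illustrative, not the prompt actually
# used for this dataset:
#
#   system_prompt: |
#     You will be given a JSON list of conversation turns. Rewrite it so that
#     each assistant turn includes explicit chain-of-thought reasoning, and
#     return the result as a JSON list with the same structure.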

def create_chat_message(system_prompt: str, conversation: str) -> List[Dict[str, Any]]:
    """Create properly formatted chat messages."""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": conversation},
    ]
    return messages

def parse_json_output(output_text: str) -> Optional[List[Dict[str, str]]]:
    """Parse and clean model output into valid JSON; return None if parsing fails."""
    output_text = output_text.strip()
    # Keep only the outermost JSON array in case the model adds surrounding prose
    json_match = re.search(r"\[.*\]", output_text, re.DOTALL)
    if json_match:
        output_text = json_match.group(0)
    try:
        # Handle double-encoded JSON (an array serialized inside a JSON string)
        if output_text.startswith('"') and output_text.endswith('"'):
            output_text = json.loads(output_text)
        result = json.loads(output_text)
        # Clean the result to remove 'tool': None entries
        cleaned_result = []
        for item in result:
            cleaned_item = {
                k: v for k, v in item.items() if k != "tool" or v is not None
            }
            cleaned_result.append(cleaned_item)
        return cleaned_result
    except json.JSONDecodeError as e:
        print(f"Error parsing output: {e}")
        return None
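
# For illustration (the completion and field names below are invented, not taken
# from a real generation): given model output such as
#
#   Here is the enhanced conversation:
#   [{"role": "user", "content": "...", "tool": null}]
#
# parse_json_output extracts the bracketed array, parses it, and drops the
# null "tool" entry, returning [{"role": "user", "content": "..."}].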

def process_dataset(
    dataset,
    system_prompt: str,
    start_index: int = 0,
    end_index: Optional[int] = None,
    n_samples: int = 0,
    model_instance: Any = None,
    batch_size: int = 16,
    max_seq_len: int = 64000,
) -> List[Dict]:
    """Process the dataset in batches and return results."""
    if end_index is None:
        end_index = len(dataset)
    else:
        end_index = min(end_index, len(dataset))
    # Handle random sampling
    dataset_size = end_index - start_index
    if n_samples > 0:
        # If n_samples is larger than the range, fall back to the full range size
        n_samples = min(n_samples, dataset_size)
        # Draw random indices within the specified range
        indices = random.sample(range(start_index, end_index), n_samples)
        dataset_slice = dataset.select(indices)
    else:
        # No sampling requested: use the full range
        dataset_slice = dataset.select(range(start_index, end_index))
    results = []
    for i in tqdm(range(0, len(dataset_slice), batch_size), desc="Processing batches"):
        batch_slice = dataset_slice.select(
            range(i, min(i + batch_size, len(dataset_slice)))
        )
        try:
            batch_inputs = []
            for item in batch_slice:
                conversation_str = json.dumps(
                    item["conversations"], ensure_ascii=False, indent=2
                )
                messages = create_chat_message(system_prompt, conversation_str)
                input_text = model_instance.processor.apply_chat_template(
                    messages, add_generation_prompt=True, tokenize=False
                )
                batch_inputs.append(input_text)
            # max_tokens is the per-sequence cap on generated tokens
            sampling_params = SamplingParams(
                max_tokens=max_seq_len, temperature=0.1, top_p=0.95
            )
            outputs = model_instance.llm.generate(batch_inputs, sampling_params)
            for item, output in zip(batch_slice, outputs):
                enhanced_convos = parse_json_output(output.outputs[0].text.strip())
                if enhanced_convos is None:
                    print(
                        f"Warning: Failed to parse output for item {item.get('id', 'unknown')}"
                    )
                    # Fall back to the original conversation when parsing fails
                    enhanced_convos = item["conversations"]
                results.append(
                    {
                        "id": item["id"],
                        "conversations": item["conversations"],
                        "cot_conversations": enhanced_convos,
                    }
                )
        except Exception as e:
            print(f"Error processing batch starting at {i}: {str(e)}")
            continue
    return results

def main():
    parser = argparse.ArgumentParser(
        description="Process dataset to enhance conversations with CoT reasoning"
    )
    parser.add_argument(
        "--model_id", type=str, required=True, help="Model name or path"
    )
    parser.add_argument(
        "--config",
        type=str,
        required=True,
        help="Path to YAML config with system prompt",
    )
    parser.add_argument(
        "--output-path",
        type=str,
        required=True,
        help="Output dataset directory path",
    )
    parser.add_argument(
        "--dataset-path", type=str, required=True, help="Input dataset path"
    )
    parser.add_argument(
        "--start-index", type=int, default=0, help="Starting index (inclusive)"
    )
    parser.add_argument("--end-index", type=int, help="Ending index (exclusive)")
    parser.add_argument(
        "--n-samples",
        type=int,
        default=0,
        help="Number of random samples to process. If 0, process all samples in range",
    )
    parser.add_argument(
        "--batch-size", type=int, default=16, help="Batch size for processing"
    )
    parser.add_argument("--debug", action="store_true", help="Enable debug mode")
    parser.add_argument(
        "--max-seq-len",
        type=int,
        default=64000,
        help="Maximum model context length, also used as the per-sequence generation token cap",
    )
    parser.add_argument(
        "--max-num-seqs",
        type=int,
        default=16,
        help="Maximum number of sequences in a batch",
    )
    parser.add_argument(
        "--enforce-eager",
        action="store_true",
        help="Whether to enforce eager execution",
    )
    args = parser.parse_args()
    # Set spawn method for vLLM worker multiprocessing (also settable in the
    # shell, as in the launch command at the top of this file)
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
    # Load system prompt and dataset
    system_prompt = load_system_prompt(args.config)
    dataset = load_from_disk(args.dataset_path)
    if isinstance(dataset, dict):
        dataset = dataset["train"]
    # Initialize the vLLM instance, wiring through the CLI options
    model_instance = LLM_Singleton(
        model_id=args.model_id,
        max_model_len=args.max_seq_len,
        max_num_seqs=args.max_num_seqs,
        enforce_eager=args.enforce_eager,
        debug=args.debug,
    )
    # Process dataset and get results
    results = process_dataset(
        dataset=dataset,
        system_prompt=system_prompt,
        start_index=args.start_index,
        end_index=args.end_index,
        n_samples=args.n_samples,
        model_instance=model_instance,
        batch_size=args.batch_size,
        max_seq_len=args.max_seq_len,
    )
    # Convert results to a HuggingFace dataset
    output_dataset = Dataset.from_dict(
        {
            "id": [r["id"] for r in results],
            "conversations": [r["conversations"] for r in results],
            "cot_conversations": [r["cot_conversations"] for r in results],
        }
    )
    # Save dataset
    output_dataset.save_to_disk(args.output_path)


if __name__ == "__main__":
    main()
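
# A quick way to sanity-check the result, assuming the output path from the
# launch command at the top of this file (the column names match what this
# script saves):
#
#   from datasets import load_from_disk
#   ds = load_from_disk("datasets/3_CoT_added/func-calling-multi-turn-final/")
#   print(ds[0]["id"])
#   print(ds[0]["cot_conversations"][:1])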