# data-prep-toolace.py (1.6 KB)
  1. #!/usr/bin/env python3
  2. import argparse
  3. import uuid
  4. from datasets import Dataset, load_dataset
  5. from tqdm import tqdm
  6. def transform_dataset(dataset_name, output_path):
  7. print(f"Loading dataset: {dataset_name}")
  8. dataset = load_dataset(dataset_name)
  9. print(f"Loaded dataset with {len(dataset['train'])} examples")
  10. new_data = {"id": [], "conversations": []}
  11. print("Transforming dataset...")
  12. for example in tqdm(dataset["train"], desc="Processing examples"):
  13. new_data["id"].append(str(uuid.uuid4()))
  14. transformed_conv = [{"from": "system", "value": example["system"]}] + example[
  15. "conversations"
  16. ]
  17. new_data["conversations"].append(transformed_conv)
  18. print("Creating new dataset...")
  19. new_dataset = Dataset.from_dict(new_data)
  20. print(f"Saving dataset to: {output_path}")
  21. new_dataset.save_to_disk(output_path)
  22. print(f"Successfully transformed {len(new_dataset)} examples")
  23. return new_dataset
  24. def main():
  25. parser = argparse.ArgumentParser(
  26. description="Transform dataset to conversation format"
  27. )
  28. parser.add_argument(
  29. "--dataset",
  30. type=str,
  31. default="Team-ACE/ToolACE",
  32. help="HuggingFace dataset name (default: Team-ACE/ToolACE)",
  33. )
  34. parser.add_argument(
  35. "--output",
  36. type=str,
  37. default="transformed_dataset",
  38. help="Output path for transformed dataset (default: transformed_dataset)",
  39. )
  40. args = parser.parse_args()
  41. transform_dataset(dataset_name=args.dataset, output_path=args.output)
  42. if __name__ == "__main__":
  43. main()