# pre-process-ToolAce.py (850 B)

  1. import json
  2. import re
  3. import uuid
  4. from collections import Counter, defaultdict
  5. from typing import Dict, List
  6. import matplotlib.pyplot as plt
  7. import networkx as nx
  8. import numpy as np
  9. import pandas as pd
  10. import seaborn as sns
  11. from datasets import Dataset, load_dataset
  12. from tqdm import tqdm
  13. dataset = load_dataset("Team-ACE/ToolACE")
  14. # Transform data
  15. new_data = {"id": [], "conversations": []}
  16. # Process each example
  17. for example in dataset["train"]:
  18. # Add system message to conversations and create new structure
  19. new_data["id"].append(str(uuid.uuid4()))
  20. new_data["conversations"].append(
  21. [{"from": "system", "value": example["system"]}] + example["conversations"]
  22. )
  23. # Create new dataset with just id and conversations
  24. new_dataset = Dataset.from_dict(new_data)
  25. # Save it
  26. new_dataset.save_to_disk("transformed_toolace-new")