# preprocess-json-agentic.py

  1. import json
  2. from datasets import DatasetDict, load_from_disk
  3. def preprocess_conversation(example):
  4. # Convert roles
  5. for conv in example["conversations"]:
  6. if conv["from"] == "human":
  7. conv["from"] = "user"
  8. elif conv["from"] == "gpt":
  9. conv["from"] = "assistant"
  10. return example
  11. def transform_conversations(example):
  12. """Transform conversations list to string format."""
  13. conv_str = "\n".join(
  14. f"{msg['from']}: {msg['value']}" for msg in example["conversations"]
  15. )
  16. return {"id": example["id"], "conversations": conv_str}
  17. # Load dataset
  18. json_balanced = load_from_disk("balanced-json-modeagentic")
  19. # Apply preprocessing
  20. processed_dataset = json_balanced.map(preprocess_conversation)
  21. processed_dataset = processed_dataset["train"].map(
  22. transform_conversations, remove_columns=["category", "subcategory", "schema"]
  23. )
  24. # Save dataset
  25. processed_dataset.save_to_disk("json-agentic-balanced-final")