# _preprocess-json-agentic.py (899 B)
  1. import json
  2. from datasets import DatasetDict, load_from_disk
  3. def preprocess_conversation(example):
  4. for conv in example["conversations"]:
  5. if conv["from"] == "human":
  6. conv["from"] = "user"
  7. elif conv["from"] == "gpt":
  8. conv["from"] = "assistant"
  9. return example
  10. def transform_conversations(example):
  11. """Transform conversations list to string format."""
  12. conv_str = "\n".join(
  13. f"{msg['from']}: {msg['value']}" for msg in example["conversations"]
  14. )
  15. return {"id": example["id"], "conversations": conv_str}
  16. json_balanced = load_from_disk("balanced-json-modeagentic")
  17. processed_dataset = json_balanced.map(preprocess_conversation)
  18. processed_dataset = processed_dataset["train"].map(
  19. transform_conversations, remove_columns=["category", "subcategory", "schema"]
  20. )
  21. processed_dataset.save_to_disk("json-agentic-balanced-final")