|
@@ -0,0 +1,36 @@
|
|
|
+import json
|
|
|
+
|
|
|
+from datasets import DatasetDict, load_from_disk
|
|
|
+
|
|
|
+
|
|
|
def preprocess_conversation(example):
    """Rename speaker tags on every turn of ``example["conversations"]``.

    A turn whose ``"from"`` field is ``"human"`` is relabelled ``"user"``
    and one tagged ``"gpt"`` is relabelled ``"assistant"``; any other tag
    is left as it is. The turn dicts are updated in place and the same
    ``example`` object is returned.
    """
    renamed_roles = {"human": "user", "gpt": "assistant"}
    for turn in example["conversations"]:
        speaker = turn["from"]
        if speaker in renamed_roles:
            turn["from"] = renamed_roles[speaker]
    return example
|
|
|
+
|
|
|
+
|
|
|
def transform_conversations(example):
    """Flatten the list of turns into a single transcript string.

    Each turn is rendered as ``"<from>: <value>"`` and the rendered turns
    are joined with newlines. Returns a new dict containing the example's
    ``"id"`` and the transcript under ``"conversations"``.
    """
    rendered_turns = [
        f"{turn['from']}: {turn['value']}" for turn in example["conversations"]
    ]
    transcript = "\n".join(rendered_turns)
    return {"id": example["id"], "conversations": transcript}
|
|
|
+
|
|
|
+
|
|
|
# Load dataset
json_balanced = load_from_disk("balanced-json-modeagentic")

# Apply preprocessing
# Only the "train" split is saved below, so select it before mapping instead
# of normalizing roles on splits whose results would be thrown away.
processed_dataset = json_balanced["train"].map(preprocess_conversation)

# Render each conversation as a string and drop the metadata columns that
# are not wanted in the final dataset.
processed_dataset = processed_dataset.map(
    transform_conversations, remove_columns=["category", "subcategory", "schema"]
)

# Save dataset
processed_dataset.save_to_disk("json-agentic-balanced-final")
|