浏览代码

update preprocess

Sanyam Bhutani 5 月之前
父节点
当前提交
d163df8d36

文件差异内容过多而无法显示
+ 346 - 0
end-to-end-use-cases/data-tool/EDA/Pre-Process-Nous.ipynb


+ 36 - 0
end-to-end-use-cases/data-tool/dataprep-scripts/preprocess-json-agentic.py

@@ -0,0 +1,36 @@
+import json
+
+from datasets import DatasetDict, load_from_disk
+
+
+def preprocess_conversation(example):
+    """Rename the speaker roles of one example's messages in place.
+
+    Every message in ``example["conversations"]`` has its ``"from"`` field
+    rewritten: ``"human"`` becomes ``"user"`` and ``"gpt"`` becomes
+    ``"assistant"``. Any other role value is left untouched.
+
+    Args:
+        example: Mapping with a ``"conversations"`` list of message dicts,
+            each carrying a ``"from"`` key.
+
+    Returns:
+        The same ``example`` object, after the in-place renaming.
+    """
+    role_aliases = {"human": "user", "gpt": "assistant"}
+    for message in example["conversations"]:
+        speaker = message["from"]
+        # Only the two known source roles are renamed; everything else stays.
+        if speaker in role_aliases:
+            message["from"] = role_aliases[speaker]
+    return example
+
+
+def transform_conversations(example):
+    """Flatten the conversations list of one example into a single string.
+
+    Each message is rendered as ``<from>: <value>`` and the rendered
+    messages are joined with a newline, keeping their original order.
+
+    Args:
+        example: Mapping with an ``"id"`` entry and a ``"conversations"``
+            list of message dicts that carry ``"from"`` and ``"value"`` keys.
+
+    Returns:
+        A new dict holding only ``"id"`` (copied from ``example``) and
+        ``"conversations"`` (the joined string).
+    """
+    rendered_turns = []
+    for turn in example["conversations"]:
+        speaker, text = turn["from"], turn["value"]
+        rendered_turns.append(f"{speaker}: {text}")
+    flattened = "\n".join(rendered_turns)
+    return {"id": example["id"], "conversations": flattened}
+
+
+# Load the dataset stored on disk under "balanced-json-modeagentic".
+json_balanced = load_from_disk("balanced-json-modeagentic")
+
+# Pipeline, in order:
+#   1. rename speaker roles on everything that was loaded,
+#   2. select the "train" entry of the result,
+#   3. flatten each conversation to a string while dropping the
+#      "category", "subcategory" and "schema" columns.
+processed_dataset = (
+    json_balanced.map(preprocess_conversation)["train"]
+    .map(
+        transform_conversations,
+        remove_columns=["category", "subcategory", "schema"],
+    )
+)
+
+
+# Persist the processed "train" data to disk.
+processed_dataset.save_to_disk("json-agentic-balanced-final")