فهرست منبع

Enable users to trust remote code in samsum dataset (#628)

Matthias Reso 8 ماه پیش
والد
کامیت
48ba6805af
2 فایل تغییر یافته به همراه 5 افزوده شده و 2 حذف شده
  1. +2 -1
      src/llama_recipes/configs/datasets.py
  2. +3 -1
      src/llama_recipes/datasets/samsum_dataset.py

+ 2 - 1
src/llama_recipes/configs/datasets.py

@@ -9,6 +9,7 @@ class samsum_dataset:
     dataset: str =  "samsum_dataset"
     train_split: str = "train"
     test_split: str = "validation"
+    trust_remote_code: bool = False
 
 
 @dataclass
@@ -37,4 +38,4 @@ class custom_dataset:
 class llamaguard_toxicchat_dataset:
     dataset: str = "llamaguard_toxicchat_dataset"
     train_split: str = "train"
-    test_split: str = "test"
+    test_split: str = "test"

+ 3 - 1
src/llama_recipes/datasets/samsum_dataset.py

@@ -8,7 +8,9 @@ import datasets
 
 
 def get_preprocessed_samsum(dataset_config, tokenizer, split):
-    dataset = datasets.load_dataset("samsum", split=split)
+    if not hasattr(dataset_config, "trust_remote_code") or not dataset_config.trust_remote_code:
+        raise ValueError("The repository for samsum contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/samsum. To activate `trust_remote_code` option use this config: --samsum_dataset.trust_remote_code=True")
+    dataset = datasets.load_dataset("samsum", split=split, trust_remote_code=dataset_config.trust_remote_code)
 
     prompt = (
         f"Summarize this dialog:\n{{dialog}}\n---\nSummary:\n"